tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

DOMtoATK.h (5139B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include <glib.h>
      8 #include "mozilla/a11y/HyperTextAccessibleBase.h"
      9 #include "nsCharTraits.h"
     10 #include "nsString.h"
     11 
     12 /**
     13 * ATK offsets are counted in unicode codepoints, while DOM offsets are counted
     14 * in UTF-16 code units.  That makes a difference for non-BMP characters,
     15 * which need two UTF-16 code units to be represented (a pair of surrogates),
     16 * while they are just one unicode character.
     17 *
     18 * To keep synchronization between ATK offsets (unicode codepoints) and DOM
     19 * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a
     20 * BOM after each non-BMP character (which would otherwise use 2 UTF-16
     21 * code units for only 1 unicode codepoint).
     22 *
     23 * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but
     24 * that usage is deprecated) normally only appear at the beginning of unicode
     25 * files, but their occurrence within text (notably after cut&paste) is not
     26 * uncommon, and they are thus considered as non-text.
     27 *
     28 * Since the selection requested through ATK may not contain both surrogates
     29 * at the ends of the selection, we need to fetch one more UTF-16 code unit
     30 * on both sides, and get rid of it before returning the string to ATK. The
     31 * ATKStringConverterHelper class maintains this; NewATKString should be used
     32 * to call it properly.
     33 *
     34 * In the end,
     35 * - if the start is between the high and low surrogates, the UTF-8 result
     36 * includes a BOM from it but not the character
     37 * - if the end is between the high and low surrogates, the UTF-8 result
     38 * includes the character but *not* the BOM
     39 * - all non-BMP characters that are fully in the string are in the UTF-8 result
     40 * as character followed by BOM
     41 */
     42 namespace mozilla {
     43 namespace a11y {
     44 
     45 namespace DOMtoATK {
     46 
     47 /**
     48 * Converts a string of accessible text into ATK gchar* string (by adding
     49 * BOMs). This can be used when offsets do not need to be adjusted because
     50 * ends of the string cannot fall between surrogates.
     51 */
     52 gchar* Convert(const nsAString& aStr);
     53 
     54 /**
     55 * Add a BOM after each non-BMP character.
     56 */
     57 void AddBOMs(nsACString& aDest, const nsACString& aSource);
     58 
     59 class ATKStringConverterHelper {
     60 public:
     61  ATKStringConverterHelper(void)
     62      :
     63 #ifdef DEBUG
     64        mAdjusted(false),
     65 #endif
     66        mStartShifted(false),
     67        mEndShifted(false) {
     68  }
     69 
     70  /**
     71   * In order to properly get non-BMP values, offsets need to be changed
     72   * to get one character more on each end, so that ConvertUTF16toUTF8 can
     73   * convert surrogates even if the originally requested offsets fall between
     74   * them.
     75   */
     76  void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count);
     77 
     78  /**
     79   * Converts a string of accessible text with adjusted offsets into ATK
     80   * gchar* string (by adding BOMs).  Note, AdjustOffsets has to be called
     81   * before getting the text passed to this.
     82   */
     83  gchar* ConvertAdjusted(const nsAString& aStr);
     84 
     85 private:
     86  /**
     87   * Remove the additional characters requested by PrepareUTF16toUTF8.
     88   */
     89  gchar* FinishUTF16toUTF8(nsCString& aStr);
     90 
     91 #ifdef DEBUG
     92  bool mAdjusted;
     93 #endif
     94  bool mStartShifted;
     95  bool mEndShifted;
     96 };
     97 
     98 /**
     99 * Get text from aAccessible, using ATKStringConverterHelper to properly
    100 * introduce appropriate BOMs.
    101 */
    102 inline gchar* NewATKString(HyperTextAccessibleBase* aAccessible,
    103                           gint aStartOffset, gint aEndOffset) {
    104  gint startOffset = aStartOffset, endOffset = aEndOffset;
    105  ATKStringConverterHelper converter;
    106  converter.AdjustOffsets(&startOffset, &endOffset,
    107                          gint(aAccessible->CharacterCount()));
    108  nsAutoString str;
    109  aAccessible->TextSubstring(startOffset, endOffset, str);
    110 
    111  if (str.Length() == 0) {
    112    // Bogus offsets, or empty string, either way we do not need conversion.
    113    return g_strdup("");
    114  }
    115 
    116  return converter.ConvertAdjusted(str);
    117 }
    118 
    119 /**
    120 * Get a character from aAccessible, fetching more data as appropriate to
    121 * properly get non-BMP characters or a BOM as appropriate.
    122 */
    123 inline gunichar ATKCharacter(HyperTextAccessibleBase* aAccessible,
    124                             gint aOffset) {
    125  // char16_t is unsigned short in Mozilla, gnuichar is guint32 in glib.
    126  gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset));
    127 
    128  if (NS_IS_LOW_SURROGATE(character)) {
    129    // Trailing surrogate, return BOM instead.
    130    return 0xFEFF;
    131  }
    132 
    133  if (NS_IS_HIGH_SURROGATE(character)) {
    134    // Heading surrogate, get the trailing surrogate and combine them.
    135    gunichar characterLow =
    136        static_cast<gunichar>(aAccessible->CharAt(aOffset + 1));
    137 
    138    if (!NS_IS_LOW_SURROGATE(characterLow)) {
    139      // It should have been a trailing surrogate... Flag the error.
    140      return 0xFFFD;
    141    }
    142    return SURROGATE_TO_UCS4(character, characterLow);
    143  }
    144 
    145  return character;
    146 }
    147 
    148 }  // namespace DOMtoATK
    149 
    150 }  // namespace a11y
    151 }  // namespace mozilla