DOMtoATK.h (5139B)
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include <glib.h>
#include "mozilla/a11y/HyperTextAccessibleBase.h"
#include "nsCharTraits.h"
#include "nsString.h"

/**
 * ATK offsets are counted in unicode codepoints, while DOM offsets are counted
 * in UTF-16 code units. That makes a difference for non-BMP characters,
 * which need two UTF-16 code units to be represented (a pair of surrogates),
 * while they are just one unicode character.
 *
 * To keep synchronization between ATK offsets (unicode codepoints) and DOM
 * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a
 * BOM after each non-BMP character (which would otherwise use 2 UTF-16
 * code units for only 1 unicode codepoint).
 *
 * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but
 * that usage is deprecated) normally only appear at the beginning of unicode
 * files, but their occurrence within text (notably after cut&paste) is not
 * uncommon, and they are thus considered non-text.
 *
 * Since the selection requested through ATK may not contain both surrogates
 * at the ends of the selection, we need to fetch one UTF-16 code unit more
 * on both sides, and get rid of it before returning the string to ATK. The
 * ATKStringConverterHelper class maintains this, NewATKString should be used
 * to call it properly.
33 * 34 * In the end, 35 * - if the start is between the high and low surrogates, the UTF-8 result 36 * includes a BOM from it but not the character 37 * - if the end is between the high and low surrogates, the UTF-8 result 38 * includes the character but *not* the BOM 39 * - all non-BMP characters that are fully in the string are in the UTF-8 result 40 * as character followed by BOM 41 */ 42 namespace mozilla { 43 namespace a11y { 44 45 namespace DOMtoATK { 46 47 /** 48 * Converts a string of accessible text into ATK gchar* string (by adding 49 * BOMs). This can be used when offsets do not need to be adjusted because 50 * ends of the string can not fall between surrogates. 51 */ 52 gchar* Convert(const nsAString& aStr); 53 54 /** 55 * Add a BOM after each non-BMP character. 56 */ 57 void AddBOMs(nsACString& aDest, const nsACString& aSource); 58 59 class ATKStringConverterHelper { 60 public: 61 ATKStringConverterHelper(void) 62 : 63 #ifdef DEBUG 64 mAdjusted(false), 65 #endif 66 mStartShifted(false), 67 mEndShifted(false) { 68 } 69 70 /** 71 * In order to properly get non-BMP values, offsets need to be changed 72 * to get one character more on each end, so that ConvertUTF16toUTF8 can 73 * convert surrogates even if the originally requested offsets fall between 74 * them. 75 */ 76 void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count); 77 78 /** 79 * Converts a string of accessible text with adjusted offsets into ATK 80 * gchar* string (by adding BOMs). Note, AdjustOffsets has to be called 81 * before getting the text passed to this. 82 */ 83 gchar* ConvertAdjusted(const nsAString& aStr); 84 85 private: 86 /** 87 * Remove the additional characters requested by PrepareUTF16toUTF8. 88 */ 89 gchar* FinishUTF16toUTF8(nsCString& aStr); 90 91 #ifdef DEBUG 92 bool mAdjusted; 93 #endif 94 bool mStartShifted; 95 bool mEndShifted; 96 }; 97 98 /** 99 * Get text from aAccessible, using ATKStringConverterHelper to properly 100 * introduce appropriate BOMs. 
101 */ 102 inline gchar* NewATKString(HyperTextAccessibleBase* aAccessible, 103 gint aStartOffset, gint aEndOffset) { 104 gint startOffset = aStartOffset, endOffset = aEndOffset; 105 ATKStringConverterHelper converter; 106 converter.AdjustOffsets(&startOffset, &endOffset, 107 gint(aAccessible->CharacterCount())); 108 nsAutoString str; 109 aAccessible->TextSubstring(startOffset, endOffset, str); 110 111 if (str.Length() == 0) { 112 // Bogus offsets, or empty string, either way we do not need conversion. 113 return g_strdup(""); 114 } 115 116 return converter.ConvertAdjusted(str); 117 } 118 119 /** 120 * Get a character from aAccessible, fetching more data as appropriate to 121 * properly get non-BMP characters or a BOM as appropriate. 122 */ 123 inline gunichar ATKCharacter(HyperTextAccessibleBase* aAccessible, 124 gint aOffset) { 125 // char16_t is unsigned short in Mozilla, gnuichar is guint32 in glib. 126 gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset)); 127 128 if (NS_IS_LOW_SURROGATE(character)) { 129 // Trailing surrogate, return BOM instead. 130 return 0xFEFF; 131 } 132 133 if (NS_IS_HIGH_SURROGATE(character)) { 134 // Heading surrogate, get the trailing surrogate and combine them. 135 gunichar characterLow = 136 static_cast<gunichar>(aAccessible->CharAt(aOffset + 1)); 137 138 if (!NS_IS_LOW_SURROGATE(characterLow)) { 139 // It should have been a trailing surrogate... Flag the error. 140 return 0xFFFD; 141 } 142 return SURROGATE_TO_UCS4(character, characterLow); 143 } 144 145 return character; 146 } 147 148 } // namespace DOMtoATK 149 150 } // namespace a11y 151 } // namespace mozilla