nsScanner.cpp (9371B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=2 sw=2 et tw=78: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 // #define __INCREMENTAL 1 8 9 #include "nsScanner.h" 10 11 #include "mozilla/Encoding.h" 12 #include "mozilla/UniquePtr.h" 13 #include "nsDebug.h" 14 #include "nsReadableUtils.h" 15 #include "nsUTF8Utils.h" // for LossyConvertEncoding 16 #include "nsCRT.h" 17 #include "nsParser.h" 18 #include "nsCharsetSource.h" 19 20 nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) 21 : mChars(aTerminateChars), 22 mFilter(char16_t(~0)) // All bits set 23 { 24 // Build filter that will be used to filter out characters with 25 // bits that none of the terminal chars have. This works very well 26 // because terminal chars often have only the last 4-6 bits set and 27 // normal ascii letters have bit 7 set. Other letters have even higher 28 // bits set. 29 30 // Calculate filter 31 const char16_t* current = aTerminateChars; 32 char16_t terminalChar = *current; 33 while (terminalChar) { 34 mFilter &= ~terminalChar; 35 ++current; 36 terminalChar = *current; 37 } 38 } 39 40 /** 41 * Use this constructor if you want i/o to be based on 42 * a single string you hand in during construction. 43 * This short cut was added for Javascript. 44 * 45 * @update gess 5/12/98 46 * @param aMode represents the parser mode (nav, other) 47 * @return 48 */ 49 nsScanner::nsScanner(const nsAString& anHTMLString, bool aIncremental) 50 : mIncremental(aIncremental) { 51 MOZ_COUNT_CTOR(nsScanner); 52 53 AppendToBuffer(anHTMLString); 54 MOZ_ASSERT(mMarkPosition == mCurrentPosition); 55 } 56 57 /** 58 * Use this constructor if you want i/o to be based on strings 59 * the scanner receives. If you pass a null filename, you 60 * can still provide data to the scanner via append. 61 */ 62 nsScanner::nsScanner(nsIURI* aURI) : mURI(aURI), mIncremental(true) { 63 MOZ_COUNT_CTOR(nsScanner); 64 65 // XXX This is a big hack. We need to initialize the iterators to something. 66 // What matters is that mCurrentPosition == mEndPosition, so that our methods 67 // believe that we are at EOF (see bug 182067). We null out mCurrentPosition 68 // so that we have some hope of catching null pointer dereferences associated 69 // with this hack. --darin 70 memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); 71 mMarkPosition = mCurrentPosition; 72 mEndPosition = mCurrentPosition; 73 74 // XML defaults to UTF-8 and about:blank is UTF-8, too. 75 SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault); 76 } 77 78 nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding, 79 int32_t aSource) { 80 if (aSource < mCharsetSource) // priority is lower than the current one 81 return NS_OK; 82 83 mCharsetSource = aSource; 84 nsCString charsetName; 85 aEncoding->Name(charsetName); 86 if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) { 87 return NS_OK; // no difference, don't change it 88 } 89 90 // different, need to change it 91 92 mCharset.Assign(charsetName); 93 94 mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval(); 95 96 return NS_OK; 97 } 98 99 /** 100 * default destructor 101 * 102 * @update gess 3/25/98 103 * @param 104 * @return 105 */ 106 nsScanner::~nsScanner() { MOZ_COUNT_DTOR(nsScanner); } 107 108 /** 109 * Resets current offset position of input stream to marked position. 110 * This allows us to back up to this point if the need should arise, 111 * such as when tokenization gets interrupted. 112 * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! 113 * 114 * @update gess 5/12/98 115 * @param 116 * @return 117 */ 118 void nsScanner::RewindToMark(void) { 119 if (mSlidingBuffer) { 120 mCurrentPosition = mMarkPosition; 121 } 122 } 123 124 /** 125 * Records current offset position in input stream. This allows us 126 * to back up to this point if the need should arise, such as when 127 * tokenization gets interrupted. 128 * 129 * @update gess 7/29/98 130 * @param 131 * @return 132 */ 133 int32_t nsScanner::Mark() { 134 int32_t distance = 0; 135 if (mSlidingBuffer) { 136 nsScannerIterator oldStart; 137 mSlidingBuffer->BeginReading(oldStart); 138 139 distance = Distance(oldStart, mCurrentPosition); 140 141 mSlidingBuffer->DiscardPrefix(mCurrentPosition); 142 mSlidingBuffer->BeginReading(mCurrentPosition); 143 mMarkPosition = mCurrentPosition; 144 } 145 146 return distance; 147 } 148 149 /** 150 * Insert data to our underlying input buffer as 151 * if it were read from an input stream. 152 * 153 * @update harishd 01/12/99 154 * @return error code 155 */ 156 bool nsScanner::UngetReadable(const nsAString& aBuffer) { 157 if (!mSlidingBuffer) { 158 return false; 159 } 160 161 mSlidingBuffer->UngetReadable(aBuffer, mCurrentPosition); 162 mSlidingBuffer->BeginReading( 163 mCurrentPosition); // Insertion invalidated our iterators 164 mSlidingBuffer->EndReading(mEndPosition); 165 166 return true; 167 } 168 169 /** 170 * Append data to our underlying input buffer as 171 * if it were read from an input stream. 172 * 173 * @update gess4/3/98 174 * @return error code 175 */ 176 nsresult nsScanner::Append(const nsAString& aBuffer) { 177 if (!AppendToBuffer(aBuffer)) return NS_ERROR_OUT_OF_MEMORY; 178 return NS_OK; 179 } 180 181 /** 182 * 183 * 184 * @update gess 5/21/98 185 * @param 186 * @return 187 */ 188 nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) { 189 nsresult res = NS_OK; 190 if (mUnicodeDecoder) { 191 mozilla::CheckedInt<size_t> needed = 192 mUnicodeDecoder->MaxUTF16BufferLength(aLen); 193 if (!needed.isValid()) { 194 return NS_ERROR_OUT_OF_MEMORY; 195 } 196 mozilla::CheckedInt<uint32_t> allocLen( 197 1); // null terminator due to legacy sadness 198 allocLen += needed.value(); 199 if (!allocLen.isValid()) { 200 return NS_ERROR_OUT_OF_MEMORY; 201 } 202 nsScannerString::Buffer* buffer = 203 nsScannerString::AllocBuffer(allocLen.value()); 204 NS_ENSURE_TRUE(buffer, NS_ERROR_OUT_OF_MEMORY); 205 char16_t* unichars = buffer->DataStart(); 206 207 uint32_t result; 208 size_t read; 209 size_t written; 210 // Do not use structured binding lest deal with [-Werror=unused-variable] 211 std::tie(result, read, written) = 212 mUnicodeDecoder->DecodeToUTF16WithoutReplacement( 213 AsBytes(mozilla::Span(aBuffer, aLen)), 214 mozilla::Span(unichars, needed.value()), 215 false); // Retain bug about failure to handle EOF 216 MOZ_ASSERT(result != mozilla::kOutputFull); 217 MOZ_ASSERT(read <= aLen); 218 MOZ_ASSERT(written <= needed.value()); 219 if (result != mozilla::kInputEmpty) { 220 // Since about:blank is empty, this line runs only for XML. Use a 221 // character that's illegal in XML instead of U+FFFD in order to make 222 // expat flag the error. There is no need to loop and convert more, since 223 // expat will stop here anyway. 224 unichars[written++] = 0xFFFF; 225 } 226 buffer->SetDataLength(written); 227 // Don't propagate return code of unicode decoder 228 // since it doesn't reflect on our success or failure 229 // - Ref. bug 87110 230 res = NS_OK; 231 AppendToBuffer(buffer); 232 } else { 233 NS_WARNING("No decoder found."); 234 res = NS_ERROR_FAILURE; 235 } 236 237 return res; 238 } 239 240 /** 241 * retrieve next char from scanners internal input stream 242 * 243 * @update gess 3/25/98 244 * @param 245 * @return error code reflecting read status 246 */ 247 nsresult nsScanner::GetChar(char16_t& aChar) { 248 if (!mSlidingBuffer || mCurrentPosition == mEndPosition) { 249 aChar = 0; 250 return NS_ERROR_HTMLPARSER_EOF; 251 } 252 253 aChar = *mCurrentPosition++; 254 255 return NS_OK; 256 } 257 258 void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, 259 const nsScannerIterator& aStart, 260 const nsScannerIterator& aEnd) { 261 aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd); 262 } 263 264 void nsScanner::CurrentPosition(nsScannerIterator& aPosition) { 265 aPosition = mCurrentPosition; 266 } 267 268 void nsScanner::EndReading(nsScannerIterator& aPosition) { 269 aPosition = mEndPosition; 270 } 271 272 void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) { 273 if (mSlidingBuffer) { 274 mCurrentPosition = aPosition; 275 if (aTerminate && (mCurrentPosition == mEndPosition)) { 276 mMarkPosition = mCurrentPosition; 277 mSlidingBuffer->DiscardPrefix(mCurrentPosition); 278 } 279 } 280 } 281 282 void nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) { 283 if (!mSlidingBuffer) { 284 mSlidingBuffer = mozilla::MakeUnique<nsScannerString>(aBuf); 285 mSlidingBuffer->BeginReading(mCurrentPosition); 286 mMarkPosition = mCurrentPosition; 287 } else { 288 mSlidingBuffer->AppendBuffer(aBuf); 289 if (mCurrentPosition == mEndPosition) { 290 mSlidingBuffer->BeginReading(mCurrentPosition); 291 } 292 } 293 mSlidingBuffer->EndReading(mEndPosition); 294 } 295 296 /** 297 * call this to copy bytes out of the scanner that have not yet been consumed 298 * by the tokenization process. 299 * 300 * @update gess 5/12/98 301 * @param aCopyBuffer is where the scanner buffer will be copied to 302 * @return true if OK or false on OOM 303 */ 304 bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) { 305 if (!mSlidingBuffer) { 306 aCopyBuffer.Truncate(); 307 return true; 308 } 309 310 nsScannerIterator start, end; 311 start = mCurrentPosition; 312 end = mEndPosition; 313 314 return CopyUnicodeTo(start, end, aCopyBuffer); 315 } 316 317 /** 318 * Conduct self test. Actually, selftesting for this class 319 * occurs in the parser selftest. 320 * 321 * @update gess 3/25/98 322 * @param 323 * @return 324 */ 325 326 void nsScanner::SelfTest(void) { 327 #ifdef _DEBUG 328 #endif 329 }