nsHtml5Tokenizer.cpp (34760B)
1 /* 2 * Copyright (c) 2005-2007 Henri Sivonen 3 * Copyright (c) 2007-2017 Mozilla Foundation 4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 5 * Foundation, and Opera Software ASA. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 * and/or sell copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 * DEALINGS IN THE SOFTWARE. 24 */ 25 26 /* 27 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. 28 * Please edit Tokenizer.java instead and regenerate. 29 */ 30 31 #define nsHtml5Tokenizer_cpp__ 32 33 #include "nsHtml5AttributeName.h" 34 #include "nsHtml5ElementName.h" 35 #include "nsHtml5TreeBuilder.h" 36 #include "nsHtml5StackNode.h" 37 #include "nsHtml5UTF16Buffer.h" 38 #include "nsHtml5StateSnapshot.h" 39 #include "nsHtml5Portability.h" 40 41 #include "nsHtml5Tokenizer.h" 42 43 char16_t nsHtml5Tokenizer::LT_GT[] = {'<', '>'}; 44 char16_t nsHtml5Tokenizer::LT_SOLIDUS[] = {'<', '/'}; 45 char16_t nsHtml5Tokenizer::RSQB_RSQB[] = {']', ']'}; 46 char16_t nsHtml5Tokenizer::REPLACEMENT_CHARACTER[] = {0xfffd}; 47 char16_t nsHtml5Tokenizer::LF[] = {'\n'}; 48 char16_t nsHtml5Tokenizer::CDATA_LSQB[] = {'C', 'D', 'A', 'T', 'A', '['}; 49 char16_t nsHtml5Tokenizer::OCTYPE[] = {'o', 'c', 't', 'y', 'p', 'e'}; 50 char16_t nsHtml5Tokenizer::UBLIC[] = {'u', 'b', 'l', 'i', 'c'}; 51 char16_t nsHtml5Tokenizer::YSTEM[] = {'y', 's', 't', 'e', 'm'}; 52 static char16_t const TITLE_ARR_DATA[] = {'t', 'i', 't', 'l', 'e'}; 53 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::TITLE_ARR = { 54 TITLE_ARR_DATA, std::size(TITLE_ARR_DATA)}; 55 static char16_t const SCRIPT_ARR_DATA[] = {'s', 'c', 'r', 'i', 'p', 't'}; 56 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::SCRIPT_ARR = { 57 SCRIPT_ARR_DATA, std::size(SCRIPT_ARR_DATA)}; 58 static char16_t const STYLE_ARR_DATA[] = {'s', 't', 'y', 'l', 'e'}; 59 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::STYLE_ARR = { 60 STYLE_ARR_DATA, std::size(STYLE_ARR_DATA)}; 61 static char16_t const PLAINTEXT_ARR_DATA[] = {'p', 'l', 'a', 'i', 'n', 62 't', 'e', 'x', 't'}; 63 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::PLAINTEXT_ARR = { 64 PLAINTEXT_ARR_DATA, std::size(PLAINTEXT_ARR_DATA)}; 65 static char16_t const XMP_ARR_DATA[] = {'x', 'm', 'p'}; 66 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::XMP_ARR = { 67 XMP_ARR_DATA, std::size(XMP_ARR_DATA)}; 68 static char16_t const TEXTAREA_ARR_DATA[] = {'t', 'e', 'x', 't', 69 'a', 'r', 'e', 'a'}; 70 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::TEXTAREA_ARR = { 71 TEXTAREA_ARR_DATA, std::size(TEXTAREA_ARR_DATA)}; 72 static char16_t const IFRAME_ARR_DATA[] = {'i', 'f', 'r', 'a', 'm', 'e'}; 73 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::IFRAME_ARR = { 74 IFRAME_ARR_DATA, std::size(IFRAME_ARR_DATA)}; 75 static char16_t const NOEMBED_ARR_DATA[] = {'n', 'o', 'e', 'm', 'b', 'e', 'd'}; 76 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::NOEMBED_ARR = { 77 NOEMBED_ARR_DATA, std::size(NOEMBED_ARR_DATA)}; 78 static char16_t const NOSCRIPT_ARR_DATA[] = {'n', 'o', 's', 'c', 79 'r', 'i', 'p', 't'}; 80 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::NOSCRIPT_ARR = { 81 NOSCRIPT_ARR_DATA, std::size(NOSCRIPT_ARR_DATA)}; 82 static char16_t const NOFRAMES_ARR_DATA[] = {'n', 'o', 'f', 'r', 83 'a', 'm', 'e', 's'}; 84 staticJArray<char16_t, int32_t> nsHtml5Tokenizer::NOFRAMES_ARR = { 85 NOFRAMES_ARR_DATA, std::size(NOFRAMES_ARR_DATA)}; 86 87 nsHtml5Tokenizer::nsHtml5Tokenizer(nsHtml5TreeBuilder* tokenHandler, 88 bool viewingXmlSource) 89 : tokenHandler(tokenHandler), 90 encodingDeclarationHandler(nullptr), 91 lastCR(false), 92 stateSave(0), 93 returnStateSave(0), 94 index(0), 95 forceQuirks(false), 96 additional('\0'), 97 entCol(0), 98 firstCharKey(0), 99 lo(0), 100 hi(0), 101 candidate(0), 102 charRefBufMark(0), 103 value(0), 104 seenDigits(false), 105 suspendAfterCurrentNonTextToken(false), 106 cstart(0), 107 strBufLen(0), 108 charRefBuf(jArray<char16_t, int32_t>::newJArray(32)), 109 charRefBufLen(0), 110 bmpChar(jArray<char16_t, int32_t>::newJArray(1)), 111 astralChar(jArray<char16_t, int32_t>::newJArray(2)), 112 endTagExpectation(nullptr), 113 endTagExpectationAsArray(nullptr), 114 endTag(false), 115 containsHyphen(false), 116 tagName(nullptr), 117 nonInternedTagName(new nsHtml5ElementName()), 118 attributeName(nullptr), 119 nonInternedAttributeName(new nsHtml5AttributeName()), 120 doctypeName(nullptr), 121 publicIdentifier(nullptr), 122 systemIdentifier(nullptr), 123 attributes(tokenHandler->HasBuilder() ? new nsHtml5HtmlAttributes(0) 124 : nullptr), 125 newAttributesEachTime(!tokenHandler->HasBuilder()), 126 shouldSuspend(false), 127 keepBuffer(false), 128 confident(false), 129 line(0), 130 attributeLine(0), 131 interner(nullptr), 132 viewingXmlSource(viewingXmlSource) { 133 MOZ_COUNT_CTOR(nsHtml5Tokenizer); 134 } 135 136 void nsHtml5Tokenizer::setInterner(nsHtml5AtomTable* interner) { 137 this->interner = interner; 138 } 139 140 void nsHtml5Tokenizer::initLocation(nsHtml5String newPublicId, 141 nsHtml5String newSystemId) { 142 this->systemId = newSystemId; 143 this->publicId = newPublicId; 144 } 145 146 bool nsHtml5Tokenizer::isViewingXmlSource() { return viewingXmlSource; } 147 148 void nsHtml5Tokenizer::setKeepBuffer(bool keepBuffer) { 149 this->keepBuffer = keepBuffer; 150 } 151 152 bool nsHtml5Tokenizer::dropBufferIfLongerThan(int32_t length) { 153 if (strBuf.length > length) { 154 strBuf = nullptr; 155 return true; 156 } 157 return false; 158 } 159 160 void nsHtml5Tokenizer::setState(int32_t specialTokenizerState) { 161 this->stateSave = specialTokenizerState; 162 this->endTagExpectation = nullptr; 163 this->endTagExpectationAsArray = nullptr; 164 } 165 166 void nsHtml5Tokenizer::setStateAndEndTagExpectation( 167 int32_t specialTokenizerState, nsHtml5ElementName* endTagExpectation) { 168 this->stateSave = specialTokenizerState; 169 this->endTagExpectation = endTagExpectation; 170 endTagExpectationToArray(); 171 } 172 173 void nsHtml5Tokenizer::endTagExpectationToArray() { 174 switch (endTagExpectation->getGroup()) { 175 case nsHtml5TreeBuilder::TITLE: { 176 endTagExpectationAsArray = TITLE_ARR; 177 return; 178 } 179 case nsHtml5TreeBuilder::SCRIPT: { 180 endTagExpectationAsArray = SCRIPT_ARR; 181 return; 182 } 183 case nsHtml5TreeBuilder::STYLE: { 184 endTagExpectationAsArray = STYLE_ARR; 185 return; 186 } 187 case nsHtml5TreeBuilder::PLAINTEXT: { 188 endTagExpectationAsArray = PLAINTEXT_ARR; 189 return; 190 } 191 case nsHtml5TreeBuilder::XMP: { 192 endTagExpectationAsArray = XMP_ARR; 193 return; 194 } 195 case nsHtml5TreeBuilder::TEXTAREA: { 196 endTagExpectationAsArray = TEXTAREA_ARR; 197 return; 198 } 199 case nsHtml5TreeBuilder::IFRAME: { 200 endTagExpectationAsArray = IFRAME_ARR; 201 return; 202 } 203 case nsHtml5TreeBuilder::NOEMBED: { 204 endTagExpectationAsArray = NOEMBED_ARR; 205 return; 206 } 207 case nsHtml5TreeBuilder::NOSCRIPT: { 208 endTagExpectationAsArray = NOSCRIPT_ARR; 209 return; 210 } 211 case nsHtml5TreeBuilder::NOFRAMES: { 212 endTagExpectationAsArray = NOFRAMES_ARR; 213 return; 214 } 215 default: { 216 MOZ_ASSERT(false, "Bad end tag expectation."); 217 return; 218 } 219 } 220 } 221 222 void nsHtml5Tokenizer::setLineNumber(int32_t line) { 223 this->attributeLine = line; 224 this->line = line; 225 } 226 227 void nsHtml5Tokenizer::appendCharRefBuf(char16_t c) { 228 MOZ_RELEASE_ASSERT(charRefBufLen < charRefBuf.length, 229 "Attempted to overrun charRefBuf!"); 230 charRefBuf[charRefBufLen++] = c; 231 } 232 233 void nsHtml5Tokenizer::emitOrAppendCharRefBuf(int32_t returnState) { 234 if ((returnState & DATA_AND_RCDATA_MASK)) { 235 appendCharRefBufToStrBuf(); 236 } else { 237 if (charRefBufLen > 0) { 238 tokenHandler->characters(charRefBuf, 0, charRefBufLen); 239 charRefBufLen = 0; 240 } 241 } 242 } 243 244 void nsHtml5Tokenizer::emitComment(int32_t provisionalHyphens, int32_t pos) { 245 RememberGt(pos); 246 tokenHandler->comment(strBuf, 0, strBufLen - provisionalHyphens); 247 clearStrBufAfterUse(); 248 cstart = pos + 1; 249 suspendIfRequestedAfterCurrentNonTextToken(); 250 } 251 252 void nsHtml5Tokenizer::flushChars(char16_t* buf, int32_t pos) { 253 if (pos > cstart) { 254 tokenHandler->characters(buf, cstart, pos - cstart); 255 } 256 cstart = INT32_MAX; 257 } 258 259 void nsHtml5Tokenizer::strBufToElementNameString() { 260 if (containsHyphen) { 261 nsAtom* annotationName = nsHtml5ElementName::ELT_ANNOTATION_XML->getName(); 262 if (nsHtml5Portability::localEqualsBuffer(annotationName, strBuf, 263 strBufLen)) { 264 tagName = nsHtml5ElementName::ELT_ANNOTATION_XML; 265 } else { 266 nonInternedTagName->setNameForNonInterned( 267 nsHtml5Portability::newLocalNameFromBuffer(strBuf, strBufLen, 268 interner), 269 true); 270 tagName = nonInternedTagName; 271 } 272 } else { 273 tagName = nsHtml5ElementName::elementNameByBuffer(strBuf, strBufLen); 274 if (!tagName) { 275 nonInternedTagName->setNameForNonInterned( 276 nsHtml5Portability::newLocalNameFromBuffer(strBuf, strBufLen, 277 interner), 278 false); 279 tagName = nonInternedTagName; 280 } 281 } 282 containsHyphen = false; 283 clearStrBufAfterUse(); 284 } 285 286 int32_t nsHtml5Tokenizer::emitCurrentTagToken(bool selfClosing, int32_t pos) { 287 RememberGt(pos); 288 cstart = pos + 1; 289 maybeErrSlashInEndTag(selfClosing); 290 stateSave = nsHtml5Tokenizer::DATA; 291 nsHtml5HtmlAttributes* attrs = 292 (!attributes ? nsHtml5HtmlAttributes::EMPTY_ATTRIBUTES : attributes); 293 if (endTag) { 294 maybeErrAttributesOnEndTag(attrs); 295 if (!viewingXmlSource) { 296 tokenHandler->endTag(tagName); 297 } 298 if (newAttributesEachTime) { 299 delete attributes; 300 attributes = nullptr; 301 } 302 } else { 303 if (viewingXmlSource) { 304 MOZ_ASSERT(newAttributesEachTime); 305 delete attributes; 306 attributes = nullptr; 307 } else { 308 tokenHandler->startTag(tagName, attrs, selfClosing); 309 } 310 } 311 tagName = nullptr; 312 if (newAttributesEachTime) { 313 attributes = nullptr; 314 } else { 315 attributes->clear(0); 316 } 317 suspendIfRequestedAfterCurrentNonTextToken(); 318 return stateSave; 319 } 320 321 void nsHtml5Tokenizer::attributeNameComplete() { 322 attributeName = 323 nsHtml5AttributeName::nameByBuffer(strBuf, strBufLen, interner); 324 if (!attributeName) { 325 nonInternedAttributeName->setNameForNonInterned( 326 nsHtml5Portability::newLocalNameFromBuffer(strBuf, strBufLen, 327 interner)); 328 attributeName = nonInternedAttributeName; 329 } 330 clearStrBufAfterUse(); 331 if (!attributes) { 332 attributes = new nsHtml5HtmlAttributes(0); 333 } 334 if (attributes->contains(attributeName)) { 335 errDuplicateAttribute(); 336 attributeName = nullptr; 337 } 338 } 339 340 void nsHtml5Tokenizer::addAttributeWithoutValue() { 341 if (attributeName) { 342 attributes->addAttribute( 343 attributeName, nsHtml5Portability::newEmptyString(), attributeLine); 344 attributeName = nullptr; 345 } else { 346 clearStrBufAfterUse(); 347 } 348 } 349 350 void nsHtml5Tokenizer::addAttributeWithValue() { 351 if (attributeName) { 352 nsHtml5String val = strBufToString(); 353 if (mViewSource) { 354 mViewSource->MaybeLinkifyAttributeValue(attributeName, val); 355 } 356 attributes->addAttribute(attributeName, val, attributeLine); 357 attributeName = nullptr; 358 } else { 359 clearStrBufAfterUse(); 360 } 361 } 362 363 void nsHtml5Tokenizer::start() { 364 initializeWithoutStarting(); 365 tokenHandler->startTokenization(this); 366 if (mViewSource) { 367 line = 1; 368 col = -1; 369 nextCharOnNewLine = false; 370 } else if (tokenHandler->WantsLineAndColumn()) { 371 line = 0; 372 col = 1; 373 nextCharOnNewLine = true; 374 } else { 375 line = -1; 376 col = -1; 377 nextCharOnNewLine = false; 378 } 379 } 380 381 bool nsHtml5Tokenizer::tokenizeBuffer(nsHtml5UTF16Buffer* buffer) { 382 int32_t state = stateSave; 383 int32_t returnState = returnStateSave; 384 char16_t c = '\0'; 385 shouldSuspend = false; 386 lastCR = false; 387 int32_t start = buffer->getStart(); 388 int32_t end = buffer->getEnd(); 389 int32_t pos = start - 1; 390 switch (state) { 391 case DATA: 392 case RCDATA: 393 case SCRIPT_DATA: 394 case PLAINTEXT: 395 case RAWTEXT: 396 case CDATA_SECTION: 397 case SCRIPT_DATA_ESCAPED: 398 case SCRIPT_DATA_ESCAPE_START: 399 case SCRIPT_DATA_ESCAPE_START_DASH: 400 case SCRIPT_DATA_ESCAPED_DASH: 401 case SCRIPT_DATA_ESCAPED_DASH_DASH: 402 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 403 case SCRIPT_DATA_DOUBLE_ESCAPED: 404 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 405 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 406 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 407 case SCRIPT_DATA_DOUBLE_ESCAPE_END: { 408 cstart = start; 409 break; 410 } 411 default: { 412 cstart = INT32_MAX; 413 break; 414 } 415 } 416 if (mViewSource) { 417 mViewSource->SetBuffer(buffer); 418 if (mozilla::htmlaccel::htmlaccelEnabled()) { 419 pos = StateLoopViewSourceSIMD(state, c, pos, buffer->getBuffer(), false, 420 returnState, buffer->getEnd()); 421 } else { 422 pos = StateLoopViewSourceALU(state, c, pos, buffer->getBuffer(), false, 423 returnState, buffer->getEnd()); 424 } 425 mViewSource->DropBuffer((pos == buffer->getEnd()) ? pos : pos + 1); 426 } else if (tokenHandler->WantsLineAndColumn()) { 427 if (mozilla::htmlaccel::htmlaccelEnabled()) { 428 pos = StateLoopLineColSIMD(state, c, pos, buffer->getBuffer(), false, 429 returnState, buffer->getEnd()); 430 } else { 431 pos = StateLoopLineColALU(state, c, pos, buffer->getBuffer(), false, 432 returnState, buffer->getEnd()); 433 } 434 } else if (mozilla::htmlaccel::htmlaccelEnabled()) { 435 pos = StateLoopFastestSIMD(state, c, pos, buffer->getBuffer(), false, 436 returnState, buffer->getEnd()); 437 } else { 438 pos = StateLoopFastestALU(state, c, pos, buffer->getBuffer(), false, 439 returnState, buffer->getEnd()); 440 } 441 if (pos == end) { 442 buffer->setStart(pos); 443 } else { 444 buffer->setStart(pos + 1); 445 } 446 return lastCR; 447 } 448 449 void nsHtml5Tokenizer::initDoctypeFields() { 450 clearStrBufAfterUse(); 451 doctypeName = nullptr; 452 if (systemIdentifier) { 453 systemIdentifier.Release(); 454 systemIdentifier = nullptr; 455 } 456 if (publicIdentifier) { 457 publicIdentifier.Release(); 458 publicIdentifier = nullptr; 459 } 460 forceQuirks = false; 461 } 462 463 void nsHtml5Tokenizer::emitReplacementCharacter(char16_t* buf, int32_t pos) { 464 flushChars(buf, pos); 465 tokenHandler->zeroOriginatingReplacementCharacter(); 466 cstart = pos + 1; 467 } 468 469 void nsHtml5Tokenizer::maybeEmitReplacementCharacter(char16_t* buf, 470 int32_t pos) { 471 flushChars(buf, pos); 472 tokenHandler->zeroOrReplacementCharacter(); 473 cstart = pos + 1; 474 } 475 476 void nsHtml5Tokenizer::emitPlaintextReplacementCharacter(char16_t* buf, 477 int32_t pos) { 478 flushChars(buf, pos); 479 tokenHandler->characters(REPLACEMENT_CHARACTER, 0, 1); 480 cstart = pos + 1; 481 } 482 483 void nsHtml5Tokenizer::bogusDoctype() { 484 errBogusDoctype(); 485 forceQuirks = true; 486 } 487 488 void nsHtml5Tokenizer::bogusDoctypeWithoutQuirks() { 489 errBogusDoctype(); 490 forceQuirks = false; 491 } 492 493 void nsHtml5Tokenizer::handleNcrValue(int32_t returnState) { 494 if (value <= 0xFFFF) { 495 if (value >= 0x80 && value <= 0x9f) { 496 errNcrInC1Range(); 497 char16_t* val = nsHtml5NamedCharacters::WINDOWS_1252[value - 0x80]; 498 emitOrAppendOne(val, returnState); 499 } else if (value == 0x0) { 500 errNcrZero(); 501 emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState); 502 } else if ((value & 0xF800) == 0xD800) { 503 errNcrSurrogate(); 504 emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState); 505 } else { 506 char16_t ch = (char16_t)value; 507 bmpChar[0] = ch; 508 emitOrAppendOne(bmpChar, returnState); 509 } 510 } else if (value <= 0x10FFFF) { 511 astralChar[0] = (char16_t)(nsHtml5Tokenizer::LEAD_OFFSET + (value >> 10)); 512 astralChar[1] = (char16_t)(0xDC00 + (value & 0x3FF)); 513 emitOrAppendTwo(astralChar, returnState); 514 } else { 515 errNcrOutOfRange(); 516 emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState); 517 } 518 } 519 520 void nsHtml5Tokenizer::eof() { 521 int32_t state = stateSave; 522 int32_t returnState = returnStateSave; 523 eofloop: 524 for (;;) { 525 switch (state) { 526 case SCRIPT_DATA_LESS_THAN_SIGN: 527 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: { 528 tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1); 529 NS_HTML5_BREAK(eofloop); 530 } 531 case TAG_OPEN: { 532 errEofAfterLt(); 533 tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1); 534 NS_HTML5_BREAK(eofloop); 535 } 536 case RAWTEXT_RCDATA_LESS_THAN_SIGN: { 537 tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1); 538 NS_HTML5_BREAK(eofloop); 539 } 540 case NON_DATA_END_TAG_NAME: { 541 tokenHandler->characters(nsHtml5Tokenizer::LT_SOLIDUS, 0, 2); 542 emitStrBuf(); 543 NS_HTML5_BREAK(eofloop); 544 } 545 case CLOSE_TAG_OPEN: { 546 errEofAfterLt(); 547 tokenHandler->characters(nsHtml5Tokenizer::LT_SOLIDUS, 0, 2); 548 NS_HTML5_BREAK(eofloop); 549 } 550 case TAG_NAME: { 551 errEofInTagName(); 552 NS_HTML5_BREAK(eofloop); 553 } 554 case BEFORE_ATTRIBUTE_NAME: 555 case AFTER_ATTRIBUTE_VALUE_QUOTED: 556 case SELF_CLOSING_START_TAG: { 557 errEofWithoutGt(); 558 NS_HTML5_BREAK(eofloop); 559 } 560 case ATTRIBUTE_NAME: { 561 errEofInAttributeName(); 562 NS_HTML5_BREAK(eofloop); 563 } 564 case AFTER_ATTRIBUTE_NAME: 565 case BEFORE_ATTRIBUTE_VALUE: { 566 errEofWithoutGt(); 567 NS_HTML5_BREAK(eofloop); 568 } 569 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 570 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 571 case ATTRIBUTE_VALUE_UNQUOTED: { 572 errEofInAttributeValue(); 573 NS_HTML5_BREAK(eofloop); 574 } 575 case BOGUS_COMMENT: { 576 emitComment(0, 0); 577 NS_HTML5_BREAK(eofloop); 578 } 579 case BOGUS_COMMENT_HYPHEN: { 580 emitComment(0, 0); 581 NS_HTML5_BREAK(eofloop); 582 } 583 case MARKUP_DECLARATION_OPEN: { 584 errBogusComment(); 585 emitComment(0, 0); 586 NS_HTML5_BREAK(eofloop); 587 } 588 case MARKUP_DECLARATION_HYPHEN: { 589 errBogusComment(); 590 emitComment(0, 0); 591 NS_HTML5_BREAK(eofloop); 592 } 593 case MARKUP_DECLARATION_OCTYPE: { 594 if (index < 6) { 595 errBogusComment(); 596 emitComment(0, 0); 597 } else { 598 errEofInDoctype(); 599 doctypeName = nullptr; 600 if (systemIdentifier) { 601 systemIdentifier.Release(); 602 systemIdentifier = nullptr; 603 } 604 if (publicIdentifier) { 605 publicIdentifier.Release(); 606 publicIdentifier = nullptr; 607 } 608 forceQuirks = true; 609 emitDoctypeToken(0); 610 NS_HTML5_BREAK(eofloop); 611 } 612 NS_HTML5_BREAK(eofloop); 613 } 614 case COMMENT_START: 615 case COMMENT: 616 case COMMENT_LESSTHAN: 617 case COMMENT_LESSTHAN_BANG: { 618 errEofInComment(); 619 emitComment(0, 0); 620 NS_HTML5_BREAK(eofloop); 621 } 622 case COMMENT_END: 623 case COMMENT_LESSTHAN_BANG_DASH_DASH: { 624 errEofInComment(); 625 emitComment(2, 0); 626 NS_HTML5_BREAK(eofloop); 627 } 628 case COMMENT_END_DASH: 629 case COMMENT_START_DASH: 630 case COMMENT_LESSTHAN_BANG_DASH: { 631 errEofInComment(); 632 emitComment(1, 0); 633 NS_HTML5_BREAK(eofloop); 634 } 635 case COMMENT_END_BANG: { 636 errEofInComment(); 637 emitComment(3, 0); 638 NS_HTML5_BREAK(eofloop); 639 } 640 case DOCTYPE: 641 case BEFORE_DOCTYPE_NAME: { 642 errEofInDoctype(); 643 forceQuirks = true; 644 emitDoctypeToken(0); 645 NS_HTML5_BREAK(eofloop); 646 } 647 case DOCTYPE_NAME: { 648 errEofInDoctype(); 649 strBufToDoctypeName(); 650 forceQuirks = true; 651 emitDoctypeToken(0); 652 NS_HTML5_BREAK(eofloop); 653 } 654 case DOCTYPE_UBLIC: 655 case DOCTYPE_YSTEM: 656 case AFTER_DOCTYPE_NAME: 657 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 658 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 659 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: { 660 errEofInDoctype(); 661 forceQuirks = true; 662 emitDoctypeToken(0); 663 NS_HTML5_BREAK(eofloop); 664 } 665 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 666 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: { 667 errEofInPublicId(); 668 forceQuirks = true; 669 publicIdentifier = strBufToString(); 670 emitDoctypeToken(0); 671 NS_HTML5_BREAK(eofloop); 672 } 673 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 674 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 675 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: { 676 errEofInDoctype(); 677 forceQuirks = true; 678 emitDoctypeToken(0); 679 NS_HTML5_BREAK(eofloop); 680 } 681 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 682 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: { 683 errEofInSystemId(); 684 forceQuirks = true; 685 systemIdentifier = strBufToString(); 686 emitDoctypeToken(0); 687 NS_HTML5_BREAK(eofloop); 688 } 689 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: { 690 errEofInDoctype(); 691 forceQuirks = true; 692 emitDoctypeToken(0); 693 NS_HTML5_BREAK(eofloop); 694 } 695 case BOGUS_DOCTYPE: { 696 emitDoctypeToken(0); 697 NS_HTML5_BREAK(eofloop); 698 } 699 case CONSUME_CHARACTER_REFERENCE: { 700 emitOrAppendCharRefBuf(returnState); 701 state = returnState; 702 continue; 703 } 704 case CHARACTER_REFERENCE_HILO_LOOKUP: { 705 emitOrAppendCharRefBuf(returnState); 706 state = returnState; 707 continue; 708 } 709 case CHARACTER_REFERENCE_TAIL: { 710 for (;;) { 711 char16_t c = '\0'; 712 entCol++; 713 for (;;) { 714 if (hi == -1) { 715 NS_HTML5_BREAK(hiloop); 716 } 717 if (entCol == nsHtml5NamedCharacters::NAMES[hi].length()) { 718 NS_HTML5_BREAK(hiloop); 719 } 720 if (entCol > nsHtml5NamedCharacters::NAMES[hi].length()) { 721 NS_HTML5_BREAK(outer); 722 } else if (c < nsHtml5NamedCharacters::NAMES[hi].charAt(entCol)) { 723 hi--; 724 } else { 725 NS_HTML5_BREAK(hiloop); 726 } 727 } 728 hiloop_end:; 729 for (;;) { 730 if (hi < lo) { 731 NS_HTML5_BREAK(outer); 732 } 733 if (entCol == nsHtml5NamedCharacters::NAMES[lo].length()) { 734 candidate = lo; 735 charRefBufMark = charRefBufLen; 736 lo++; 737 } else if (entCol > nsHtml5NamedCharacters::NAMES[lo].length()) { 738 NS_HTML5_BREAK(outer); 739 } else if (c > nsHtml5NamedCharacters::NAMES[lo].charAt(entCol)) { 740 lo++; 741 } else { 742 NS_HTML5_BREAK(loloop); 743 } 744 } 745 loloop_end:; 746 if (hi < lo) { 747 NS_HTML5_BREAK(outer); 748 } 749 continue; 750 } 751 outer_end:; 752 if (candidate == -1) { 753 emitOrAppendCharRefBuf(returnState); 754 state = returnState; 755 NS_HTML5_CONTINUE(eofloop); 756 } else { 757 const nsHtml5CharacterName& candidateName = 758 nsHtml5NamedCharacters::NAMES[candidate]; 759 if (!candidateName.length() || 760 candidateName.charAt(candidateName.length() - 1) != ';') { 761 if ((returnState & DATA_AND_RCDATA_MASK)) { 762 char16_t ch; 763 if (charRefBufMark == charRefBufLen) { 764 ch = '\0'; 765 } else { 766 ch = charRefBuf[charRefBufMark]; 767 } 768 if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || 769 (ch >= 'a' && ch <= 'z')) { 770 appendCharRefBufToStrBuf(); 771 state = returnState; 772 NS_HTML5_CONTINUE(eofloop); 773 } 774 } 775 if ((returnState & DATA_AND_RCDATA_MASK)) { 776 errUnescapedAmpersandInterpretedAsCharacterReference(); 777 } else { 778 errNotSemicolonTerminated(); 779 } 780 } 781 const char16_t* val = nsHtml5NamedCharacters::VALUES[candidate]; 782 if (!val[1]) { 783 emitOrAppendOne(val, returnState); 784 } else { 785 emitOrAppendTwo(val, returnState); 786 } 787 if (charRefBufMark < charRefBufLen) { 788 if ((returnState & DATA_AND_RCDATA_MASK)) { 789 appendStrBuf(charRefBuf, charRefBufMark, 790 charRefBufLen - charRefBufMark); 791 } else { 792 tokenHandler->characters(charRefBuf, charRefBufMark, 793 charRefBufLen - charRefBufMark); 794 } 795 } 796 charRefBufLen = 0; 797 state = returnState; 798 NS_HTML5_CONTINUE(eofloop); 799 } 800 } 801 case CONSUME_NCR: 802 case DECIMAL_NRC_LOOP: 803 case HEX_NCR_LOOP: { 804 if (!seenDigits) { 805 errNoDigitsInNCR(); 806 emitOrAppendCharRefBuf(returnState); 807 state = returnState; 808 continue; 809 } else { 810 errCharRefLacksSemicolon(); 811 } 812 handleNcrValue(returnState); 813 state = returnState; 814 continue; 815 } 816 case CDATA_RSQB: { 817 tokenHandler->characters(nsHtml5Tokenizer::RSQB_RSQB, 0, 1); 818 NS_HTML5_BREAK(eofloop); 819 } 820 case CDATA_RSQB_RSQB: { 821 tokenHandler->characters(nsHtml5Tokenizer::RSQB_RSQB, 0, 2); 822 NS_HTML5_BREAK(eofloop); 823 } 824 case DATA: 825 default: { 826 NS_HTML5_BREAK(eofloop); 827 } 828 } 829 } 830 eofloop_end:; 831 tokenHandler->eof(); 832 return; 833 } 834 835 void nsHtml5Tokenizer::emitDoctypeToken(int32_t pos) { 836 RememberGt(pos); 837 cstart = pos + 1; 838 tokenHandler->doctype(doctypeName, publicIdentifier, systemIdentifier, 839 forceQuirks); 840 doctypeName = nullptr; 841 publicIdentifier.Release(); 842 publicIdentifier = nullptr; 843 systemIdentifier.Release(); 844 systemIdentifier = nullptr; 845 suspendIfRequestedAfterCurrentNonTextToken(); 846 } 847 848 void nsHtml5Tokenizer::suspendAfterCurrentTokenIfNotInText() { 849 switch (stateSave) { 850 case DATA: 851 case RCDATA: 852 case SCRIPT_DATA: 853 case RAWTEXT: 854 case SCRIPT_DATA_ESCAPED: 855 case PLAINTEXT: 856 case NON_DATA_END_TAG_NAME: 857 case SCRIPT_DATA_LESS_THAN_SIGN: 858 case SCRIPT_DATA_ESCAPE_START: 859 case SCRIPT_DATA_ESCAPE_START_DASH: 860 case SCRIPT_DATA_ESCAPED_DASH: 861 case SCRIPT_DATA_ESCAPED_DASH_DASH: 862 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 863 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 864 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 865 case SCRIPT_DATA_DOUBLE_ESCAPED: 866 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 867 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 868 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 869 case SCRIPT_DATA_DOUBLE_ESCAPE_END: { 870 return; 871 } 872 case TAG_NAME: 873 case BEFORE_ATTRIBUTE_NAME: 874 case ATTRIBUTE_NAME: 875 case AFTER_ATTRIBUTE_NAME: 876 case BEFORE_ATTRIBUTE_VALUE: 877 case AFTER_ATTRIBUTE_VALUE_QUOTED: 878 case BOGUS_COMMENT: 879 case MARKUP_DECLARATION_OPEN: 880 case DOCTYPE: 881 case BEFORE_DOCTYPE_NAME: 882 case DOCTYPE_NAME: 883 case AFTER_DOCTYPE_NAME: 884 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 885 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 886 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 887 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 888 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 889 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 890 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 891 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 892 case BOGUS_DOCTYPE: 893 case COMMENT_START: 894 case COMMENT_START_DASH: 895 case COMMENT: 896 case COMMENT_END_DASH: 897 case COMMENT_END: 898 case COMMENT_END_BANG: 899 case TAG_OPEN: 900 case CLOSE_TAG_OPEN: 901 case MARKUP_DECLARATION_HYPHEN: 902 case MARKUP_DECLARATION_OCTYPE: 903 case DOCTYPE_UBLIC: 904 case DOCTYPE_YSTEM: 905 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 906 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 907 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 908 case SELF_CLOSING_START_TAG: 909 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 910 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 911 case ATTRIBUTE_VALUE_UNQUOTED: 912 case BOGUS_COMMENT_HYPHEN: 913 case COMMENT_LESSTHAN: 914 case COMMENT_LESSTHAN_BANG: 915 case COMMENT_LESSTHAN_BANG_DASH: 916 case COMMENT_LESSTHAN_BANG_DASH_DASH: 917 case CDATA_START: 918 case CDATA_SECTION: 919 case CDATA_RSQB: 920 case CDATA_RSQB_RSQB: 921 case PROCESSING_INSTRUCTION: 922 case PROCESSING_INSTRUCTION_QUESTION_MARK: { 923 break; 924 } 925 case CONSUME_CHARACTER_REFERENCE: 926 case CONSUME_NCR: 927 case CHARACTER_REFERENCE_TAIL: 928 case HEX_NCR_LOOP: 929 case DECIMAL_NRC_LOOP: 930 case HANDLE_NCR_VALUE: 931 case HANDLE_NCR_VALUE_RECONSUME: 932 case CHARACTER_REFERENCE_HILO_LOOKUP: { 933 if (returnStateSave == DATA || returnStateSave == RCDATA) { 934 return; 935 } 936 break; 937 } 938 default: { 939 MOZ_ASSERT(false, "Incomplete switch"); 940 return; 941 } 942 } 943 suspendAfterCurrentNonTextToken = true; 944 } 945 946 bool nsHtml5Tokenizer::suspensionAfterCurrentNonTextTokenPending() { 947 return suspendAfterCurrentNonTextToken; 948 } 949 950 bool nsHtml5Tokenizer::internalEncodingDeclaration( 951 nsHtml5String internalCharset) { 952 if (encodingDeclarationHandler) { 953 return encodingDeclarationHandler->internalEncodingDeclaration( 954 internalCharset); 955 } 956 return false; 957 } 958 959 void nsHtml5Tokenizer::end() { 960 if (!keepBuffer) { 961 strBuf = nullptr; 962 } 963 doctypeName = nullptr; 964 if (systemIdentifier) { 965 systemIdentifier.Release(); 966 systemIdentifier = nullptr; 967 } 968 if (publicIdentifier) { 969 publicIdentifier.Release(); 970 publicIdentifier = nullptr; 971 } 972 tagName = nullptr; 973 nonInternedTagName->setNameForNonInterned(nullptr, false); 974 attributeName = nullptr; 975 nonInternedAttributeName->setNameForNonInterned(nullptr); 976 tokenHandler->endTokenization(); 977 if (attributes) { 978 attributes->clear(0); 979 } 980 } 981 982 void nsHtml5Tokenizer::resetToDataState() { 983 clearStrBufAfterUse(); 984 charRefBufLen = 0; 985 stateSave = nsHtml5Tokenizer::DATA; 986 lastCR = false; 987 index = 0; 988 forceQuirks = false; 989 additional = '\0'; 990 entCol = -1; 991 firstCharKey = -1; 992 lo = 0; 993 hi = 0; 994 candidate = -1; 995 charRefBufMark = 0; 996 value = 0; 997 seenDigits = false; 998 suspendAfterCurrentNonTextToken = false; 999 endTag = false; 1000 shouldSuspend = false; 1001 initDoctypeFields(); 1002 containsHyphen = false; 1003 tagName = nullptr; 1004 attributeName = nullptr; 1005 if (newAttributesEachTime) { 1006 if (attributes) { 1007 delete attributes; 1008 attributes = nullptr; 1009 } 1010 } 1011 } 1012 1013 void nsHtml5Tokenizer::loadState(nsHtml5Tokenizer* other) { 1014 strBufLen = other->strBufLen; 1015 if (strBufLen > strBuf.length) { 1016 strBuf = jArray<char16_t, int32_t>::newJArray(strBufLen); 1017 } 1018 nsHtml5ArrayCopy::arraycopy(other->strBuf, strBuf, strBufLen); 1019 charRefBufLen = other->charRefBufLen; 1020 nsHtml5ArrayCopy::arraycopy(other->charRefBuf, charRefBuf, charRefBufLen); 1021 stateSave = other->stateSave; 1022 returnStateSave = other->returnStateSave; 1023 endTagExpectation = other->endTagExpectation; 1024 endTagExpectationAsArray = other->endTagExpectationAsArray; 1025 lastCR = other->lastCR; 1026 index = other->index; 1027 forceQuirks = other->forceQuirks; 1028 additional = other->additional; 1029 entCol = other->entCol; 1030 firstCharKey = other->firstCharKey; 1031 lo = other->lo; 1032 hi = other->hi; 1033 candidate = other->candidate; 1034 charRefBufMark = other->charRefBufMark; 1035 value = other->value; 1036 seenDigits = other->seenDigits; 1037 endTag = other->endTag; 1038 shouldSuspend = false; 1039 suspendAfterCurrentNonTextToken = false; 1040 doctypeName = other->doctypeName; 1041 systemIdentifier.Release(); 1042 if (!other->systemIdentifier) { 1043 systemIdentifier = nullptr; 1044 } else { 1045 systemIdentifier = 1046 nsHtml5Portability::newStringFromString(other->systemIdentifier); 1047 } 1048 publicIdentifier.Release(); 1049 if (!other->publicIdentifier) { 1050 publicIdentifier = nullptr; 1051 } else { 1052 publicIdentifier = 1053 nsHtml5Portability::newStringFromString(other->publicIdentifier); 1054 } 1055 containsHyphen = other->containsHyphen; 1056 if (!other->tagName) { 1057 tagName = nullptr; 1058 } else if (other->tagName->isInterned()) { 1059 tagName = other->tagName; 1060 } else { 1061 nonInternedTagName->setNameForNonInterned(other->tagName->getName(), 1062 other->tagName->isCustom()); 1063 tagName = nonInternedTagName; 1064 } 1065 if (!other->attributeName) { 1066 attributeName = nullptr; 1067 } else if (other->attributeName->isInterned()) { 1068 attributeName = other->attributeName; 1069 } else { 1070 nonInternedAttributeName->setNameForNonInterned( 1071 other->attributeName->getLocal(nsHtml5AttributeName::HTML)); 1072 attributeName = nonInternedAttributeName; 1073 } 1074 delete attributes; 1075 if (!other->attributes) { 1076 attributes = nullptr; 1077 } else { 1078 attributes = other->attributes->cloneAttributes(); 1079 } 1080 } 1081 1082 void nsHtml5Tokenizer::initializeWithoutStarting() { 1083 confident = false; 1084 if (!keepBuffer) { 1085 strBuf = nullptr; 1086 } 1087 line = 1; 1088 attributeLine = 1; 1089 resetToDataState(); 1090 } 1091 1092 void nsHtml5Tokenizer::setEncodingDeclarationHandler( 1093 nsHtml5StreamParser* encodingDeclarationHandler) { 1094 this->encodingDeclarationHandler = encodingDeclarationHandler; 1095 } 1096 1097 nsHtml5Tokenizer::~nsHtml5Tokenizer() { 1098 MOZ_COUNT_DTOR(nsHtml5Tokenizer); 1099 delete nonInternedTagName; 1100 nonInternedTagName = nullptr; 1101 delete nonInternedAttributeName; 1102 nonInternedAttributeName = nullptr; 1103 delete attributes; 1104 attributes = nullptr; 1105 } 1106 1107 void nsHtml5Tokenizer::initializeStatics() {} 1108 1109 void nsHtml5Tokenizer::releaseStatics() {} 1110 1111 #include "nsHtml5TokenizerCppSupplement.h"