Tokenizer.java (351838B)
1 /* 2 * Copyright (c) 2005-2007 Henri Sivonen 3 * Copyright (c) 2007-2017 Mozilla Foundation 4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 5 * Foundation, and Opera Software ASA. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 * and/or sell copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 * DEALINGS IN THE SOFTWARE. 24 */ 25 26 /* 27 * The comments following this one that use the same comment syntax as this 28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 29 * amended as of June 18 2008 and May 31 2010. 30 * That document came with this statement: 31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 32 * Opera Software ASA. You are granted a license to use, reproduce and 33 * create derivative works of this document." 
34 */ 35 36 package nu.validator.htmlparser.impl; 37 38 import java.util.HashMap; 39 40 import org.xml.sax.ErrorHandler; 41 import org.xml.sax.Locator; 42 import org.xml.sax.ext.Locator2; 43 import org.xml.sax.SAXException; 44 import org.xml.sax.SAXParseException; 45 46 import nu.validator.htmlparser.annotation.Auto; 47 import nu.validator.htmlparser.annotation.CharacterName; 48 import nu.validator.htmlparser.annotation.Const; 49 import nu.validator.htmlparser.annotation.Inline; 50 import nu.validator.htmlparser.annotation.Local; 51 import nu.validator.htmlparser.annotation.NoLength; 52 import nu.validator.htmlparser.common.EncodingDeclarationHandler; 53 import nu.validator.htmlparser.common.Interner; 54 import nu.validator.htmlparser.common.TokenHandler; 55 import nu.validator.htmlparser.common.XmlViolationPolicy; 56 57 /** 58 * An implementation of 59 * https://html.spec.whatwg.org/multipage/syntax.html#tokenization 60 * 61 * This class implements the <code>Locator</code> interface. This is not an 62 * incidental implementation detail: Users of this class are encouraged to make 63 * use of the <code>Locator</code> nature. 64 * 65 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer 66 * can be configured to treat these conditions as fatal or to coerce the infoset 67 * to something that XML 1.0 allows. 
*
 * @version $Id$
 * @author hsivonen
 */
public class Tokenizer implements Locator, Locator2 {

    /**
     * Bit mask that clears the lowest bit of a state value. Because
     * <code>DATA</code> is 0 and <code>RCDATA</code> is 1, masking a state
     * with this value yields zero exactly for those two states.
     */
    private static final int DATA_AND_RCDATA_MASK = ~1;

    // Tokenizer state identifiers. Each state is a distinct int. The
    // constants are public, so renumbering them would be visible to callers.

    public static final int DATA = 0;

    public static final int RCDATA = 1;

    public static final int SCRIPT_DATA = 2;

    public static final int RAWTEXT = 3;

    public static final int SCRIPT_DATA_ESCAPED = 4;

    public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;

    public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;

    public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;

    public static final int PLAINTEXT = 8;

    public static final int TAG_OPEN = 9;

    public static final int CLOSE_TAG_OPEN = 10;

    public static final int TAG_NAME = 11;

    public static final int BEFORE_ATTRIBUTE_NAME = 12;

    public static final int ATTRIBUTE_NAME = 13;

    public static final int AFTER_ATTRIBUTE_NAME = 14;

    public static final int BEFORE_ATTRIBUTE_VALUE = 15;

    public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;

    public static final int BOGUS_COMMENT = 17;

    public static final int MARKUP_DECLARATION_OPEN = 18;

    public static final int DOCTYPE = 19;

    public static final int BEFORE_DOCTYPE_NAME = 20;

    public static final int DOCTYPE_NAME = 21;

    public static final int AFTER_DOCTYPE_NAME = 22;

    public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;

    public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;

    public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;

    public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;

    public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;

    public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;

    public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;

    public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;

// Tokenizer state identifiers, values 31 through 64.
    public static final int BOGUS_DOCTYPE = 31;

    public static final int COMMENT_START = 32;

    public static final int COMMENT_START_DASH = 33;

    public static final int COMMENT = 34;

    public static final int COMMENT_END_DASH = 35;

    public static final int COMMENT_END = 36;

    public static final int COMMENT_END_BANG = 37;

    public static final int NON_DATA_END_TAG_NAME = 38;

    public static final int MARKUP_DECLARATION_HYPHEN = 39;

    // NOTE(review): the _OCTYPE, _UBLIC and _YSTEM state names appear to pair
    // with the OCTYPE, UBLIC and YSTEM keyword-tail arrays declared in this
    // class; confirm against the state loop, which is outside this view.
    public static final int MARKUP_DECLARATION_OCTYPE = 40;

    public static final int DOCTYPE_UBLIC = 41;

    public static final int DOCTYPE_YSTEM = 42;

    public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;

    public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;

    public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;

    public static final int CONSUME_CHARACTER_REFERENCE = 46;

    public static final int CONSUME_NCR = 47;

    public static final int CHARACTER_REFERENCE_TAIL = 48;

    public static final int HEX_NCR_LOOP = 49;

    // "NRC" (sic): the spelling is part of the public constant name.
    public static final int DECIMAL_NRC_LOOP = 50;

    public static final int HANDLE_NCR_VALUE = 51;

    public static final int HANDLE_NCR_VALUE_RECONSUME = 52;

    public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;

    public static final int SELF_CLOSING_START_TAG = 54;

    public static final int CDATA_START = 55;

    public static final int CDATA_SECTION = 56;

    public static final int CDATA_RSQB = 57;

    public static final int CDATA_RSQB_RSQB = 58;

    public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;

    public static final int SCRIPT_DATA_ESCAPE_START = 60;

    public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;

    public static final int SCRIPT_DATA_ESCAPED_DASH = 62;

    public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;

    public static final int BOGUS_COMMENT_HYPHEN = 64;

// Tokenizer state identifiers, values 65 through 79.
    public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;

    public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;

    public static final int PROCESSING_INSTRUCTION = 73;

    public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;

    // Note: no state is assigned the value 75.
    public static final int COMMENT_LESSTHAN = 76;

    public static final int COMMENT_LESSTHAN_BANG = 77;

    public static final int COMMENT_LESSTHAN_BANG_DASH = 78;

    public static final int COMMENT_LESSTHAN_BANG_DASH_DASH = 79;

    /**
     * Magic value for UTF-16 operations: 0xD800 minus (0x10000 >> 10).
     *
     * NOTE(review): presumably added to (codePoint >> 10) to obtain the lead
     * surrogate of an astral code point; the use site is outside this view.
     */
    private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));

    /**
     * UTF-16 code unit array containing less than and greater than for emitting
     * those characters on certain parse errors.
     */
    private static final @NoLength char[] LT_GT = { '<', '>' };

    /**
     * UTF-16 code unit array containing less than and solidus for emitting
     * those characters on certain parse errors.
     */
    private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };

    /**
     * UTF-16 code unit array containing ]] for emitting those characters on
     * state transitions.
     */
    private static final @NoLength char[] RSQB_RSQB = { ']', ']' };

    /**
     * Array version of U+FFFD.
     */
    private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };

    // [NOCPP[

    /**
     * Array version of space.
*/
    private static final @NoLength char[] SPACE = { ' ' };

    // ]NOCPP]

    /**
     * Array version of line feed.
     */
    private static final @NoLength char[] LF = { '\n' };

    /**
     * "CDATA[" as <code>char[]</code>
     */
    private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
            'A', '[' };

    /**
     * "octype" as <code>char[]</code>
     */
    private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
            'e' };

    /**
     * "ublic" as <code>char[]</code>
     */
    private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };

    /**
     * "ystem" as <code>char[]</code>
     */
    private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };

    // Lower-case element names as char arrays. endTagExpectationToArray()
    // assigns one of these to endTagExpectationAsArray according to the group
    // of the expected end tag.

    private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };

    private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };

    private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };

    private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
            'e', 'x', 't' };

    private static final char[] XMP_ARR = { 'x', 'm', 'p' };

    private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
            'e', 'a' };

    private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };

    private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
            'd' };

    private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
            'p', 't' };

    private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
            'e', 's' };

    /**
     * The token handler. Receives the characters and comments emitted by this
     * tokenizer.
     */
    protected final TokenHandler tokenHandler;

    /**
     * Source of the value returned by <code>getEncoding()</code>; may be
     * <code>null</code>, and both constructors set it to <code>null</code>.
     */
    protected EncodingDeclarationHandler encodingDeclarationHandler;

    // [NOCPP[

    /**
     * The error handler. May be <code>null</code>, in which case
     * <code>err</code> and <code>warn</code> report nothing.
     */
    protected ErrorHandler errorHandler;

    // ]NOCPP]

    /**
     * Whether the previous char read was CR.
*/
    protected boolean lastCR;

    /**
     * The saved tokenizer state; written by <code>setState</code> and the
     * <code>setStateAndEndTagExpectation</code> overloads.
     */
    protected int stateSave;

    private int returnStateSave;

    protected int index;

    private boolean forceQuirks;

    private char additional;

    private int entCol;

    private int firstCharKey;

    private int lo;

    private int hi;

    private int candidate;

    private int charRefBufMark;

    // NOTE(review): value and seenDigits look like numeric character
    // reference parsing state; confirm against the state loop.
    protected int value;

    private boolean seenDigits;

    private boolean suspendAfterCurrentNonTextToken;

    /**
     * Start index of the pending run of character data.
     * <code>flushChars</code> emits <code>buf[cstart, pos)</code> and then
     * sets this to <code>Integer.MAX_VALUE</code>; <code>emitComment</code>
     * sets it to <code>pos + 1</code>.
     */
    protected int cstart;

    /**
     * The SAX public id for the resource being tokenized. (Only passed back
     * as part of locator data.)
     */
    private String publicId;

    /**
     * The SAX system id for the resource being tokenized. (Only passed back
     * as part of locator data.)
     */
    private String systemId;

    /**
     * Buffer for bufferable things other than those that fit the description
     * of <code>charRefBuf</code>.
     */
    private @Auto char[] strBuf;

    /**
     * Number of significant <code>char</code>s in <code>strBuf</code>.
     */
    private int strBufLen;

    /**
     * Buffer for characters that might form a character reference but may
     * end up not forming one.
     */
    private final @Auto char[] charRefBuf;

    /**
     * Number of significant <code>char</code>s in <code>charRefBuf</code>.
     */
    private int charRefBufLen;

    /**
     * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
     */
    private final @Auto char[] bmpChar;

    /**
     * Buffer for expanding astral NCRs.
     */
    private final @Auto char[] astralChar;

    /**
     * The element whose end tag closes the current CDATA or RCDATA element.
     */
    protected ElementName endTagExpectation = null;

    // Set by endTagExpectationToArray() to one of the shared static *_ARR
    // arrays, or to null by setState(int).
    private char[] endTagExpectationAsArray; // not @Auto!
/**
     * <code>true</code> if tokenizing an end tag
     */
    protected boolean endTag;

    /**
     * <code>true</code> iff the current element/attribute name contains
     * a hyphen.
     */
    private boolean containsHyphen;

    /**
     * The current tag token name. One of
     * 1) null,
     * 2) non-owning reference to nonInternedTagName
     * 3) non-owning reference to a pre-interned ElementName
     */
    private ElementName tagName = null;

    /**
     * The recycled ElementName instance for the non-pre-interned cases.
     * Allocated once by each constructor.
     */
    private ElementName nonInternedTagName = null;

    /**
     * The current attribute name.
     */
    protected AttributeName attributeName = null;

    // CPPONLY: private AttributeName nonInternedAttributeName = null;

    // [NOCPP[

    /**
     * Whether comment tokens are emitted. When <code>false</code>,
     * <code>emitComment</code> skips the call to the token handler.
     */
    private boolean wantsComments = false;

    /**
     * Whether the stream is past the first 1024 bytes. Set to
     * <code>true</code> by <code>notifyAboutMetaBoundary()</code>.
     */
    private boolean metaBoundaryPassed;

    // ]NOCPP]

    /**
     * The name of the current doctype token.
     */
    private @Local String doctypeName;

    /**
     * The public id of the current doctype token.
     */
    private String publicIdentifier;

    /**
     * The system id of the current doctype token.
     */
    private String systemIdentifier;

    /**
     * The attribute holder.
     */
    private HtmlAttributes attributes;

    // [NOCPP[

    /**
     * The policy for vertical tab and form feed.
     */
    private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * The policy for comments.
499 */ 500 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET; 501 502 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET; 503 504 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET; 505 506 private int mappingLangToXmlLang; 507 508 // ]NOCPP] 509 510 private final boolean newAttributesEachTime; 511 512 private boolean shouldSuspend; 513 514 private boolean keepBuffer; 515 516 protected boolean confident; 517 518 private int line; 519 520 /* 521 * The line number of the current attribute. First set to the line of the 522 * attribute name and if there is a value, set to the line the value 523 * started on. 524 */ 525 // CPPONLY: private int attributeLine; 526 527 private Interner interner; 528 529 // CPPONLY: private boolean viewingXmlSource; 530 531 // [NOCPP[ 532 533 protected LocatorImpl ampersandLocation; 534 535 public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) { 536 this.tokenHandler = tokenHandler; 537 this.encodingDeclarationHandler = null; 538 this.lastCR = false; 539 this.stateSave = 0; 540 this.returnStateSave = 0; 541 this.index = 0; 542 this.forceQuirks = false; 543 this.additional = '\u0000'; 544 this.entCol = 0; 545 this.firstCharKey = 0; 546 this.lo = 0; 547 this.hi = 0; 548 this.candidate = 0; 549 this.charRefBufMark = 0; 550 this.value = 0; 551 this.seenDigits = false; 552 this.suspendAfterCurrentNonTextToken = false; 553 this.cstart = 0; 554 this.strBufLen = 0; 555 this.newAttributesEachTime = newAttributesEachTime; 556 // ∳ is the longest valid char ref and 557 // the semicolon never gets appended to the buffer. 
558 this.charRefBuf = new char[32]; 559 this.charRefBufLen = 0; 560 this.bmpChar = new char[1]; 561 this.astralChar = new char[2]; 562 this.endTagExpectation = null; 563 this.endTagExpectationAsArray = null; 564 this.endTag = false; 565 this.containsHyphen = false; 566 this.tagName = null; 567 this.nonInternedTagName = new ElementName(); 568 this.attributeName = null; 569 // CPPONLY: this.nonInternedAttributeName = new AttributeName(); 570 this.doctypeName = null; 571 this.publicIdentifier = null; 572 this.systemIdentifier = null; 573 this.attributes = null; 574 this.shouldSuspend = false; 575 this.keepBuffer = false; 576 this.confident = false; 577 this.line = 0; 578 // CPPONLY: this.attributeLine = 0; 579 this.interner = null; 580 } 581 582 // ]NOCPP] 583 584 /** 585 * The constructor. 586 * 587 * @param tokenHandler 588 * the handler for receiving tokens 589 */ 590 public Tokenizer(TokenHandler tokenHandler 591 // CPPONLY: , boolean viewingXmlSource 592 ) { 593 this.tokenHandler = tokenHandler; 594 this.encodingDeclarationHandler = null; 595 // [NOCPP[ 596 this.newAttributesEachTime = false; 597 // ]NOCPP] 598 this.lastCR = false; 599 this.stateSave = 0; 600 this.returnStateSave = 0; 601 this.index = 0; 602 this.forceQuirks = false; 603 this.additional = '\u0000'; 604 this.entCol = 0; 605 this.firstCharKey = 0; 606 this.lo = 0; 607 this.hi = 0; 608 this.candidate = 0; 609 this.charRefBufMark = 0; 610 this.value = 0; 611 this.seenDigits = false; 612 this.suspendAfterCurrentNonTextToken = false; 613 this.cstart = 0; 614 this.strBufLen = 0; 615 // ∳ is the longest valid char ref and 616 // the semicolon never gets appended to the buffer. 
617 this.charRefBuf = new char[32]; 618 this.charRefBufLen = 0; 619 this.bmpChar = new char[1]; 620 this.astralChar = new char[2]; 621 this.endTagExpectation = null; 622 this.endTagExpectationAsArray = null; 623 this.endTag = false; 624 this.containsHyphen = false; 625 this.tagName = null; 626 this.nonInternedTagName = new ElementName(); 627 this.attributeName = null; 628 // CPPONLY: this.nonInternedAttributeName = new AttributeName(); 629 this.doctypeName = null; 630 this.publicIdentifier = null; 631 this.systemIdentifier = null; 632 // [NOCPP[ 633 this.attributes = null; 634 // ]NOCPP] 635 // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null; 636 // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder(); 637 this.shouldSuspend = false; 638 this.keepBuffer = false; 639 this.confident = false; 640 this.line = 0; 641 // CPPONLY: this.attributeLine = 0; 642 this.interner = null; 643 // CPPONLY: this.viewingXmlSource = viewingXmlSource; 644 } 645 646 public void setInterner(Interner interner) { 647 this.interner = interner; 648 } 649 650 public void initLocation(String newPublicId, String newSystemId) { 651 this.systemId = newSystemId; 652 this.publicId = newPublicId; 653 654 } 655 656 // CPPONLY: boolean isViewingXmlSource() { 657 // CPPONLY: return viewingXmlSource; 658 // CPPONLY: } 659 660 public void setKeepBuffer(boolean keepBuffer) { 661 this.keepBuffer = keepBuffer; 662 } 663 664 public boolean dropBufferIfLongerThan(int length) { 665 if (strBuf.length > length) { 666 strBuf = null; 667 return true; 668 } 669 return false; 670 } 671 672 // [NOCPP[ 673 674 /** 675 * Returns the mappingLangToXmlLang. 676 * 677 * @return the mappingLangToXmlLang 678 */ 679 public boolean isMappingLangToXmlLang() { 680 return mappingLangToXmlLang == AttributeName.HTML_LANG; 681 } 682 683 /** 684 * Sets the mappingLangToXmlLang. 
685 * 686 * @param mappingLangToXmlLang 687 * the mappingLangToXmlLang to set 688 */ 689 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 690 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG 691 : AttributeName.HTML; 692 } 693 694 /** 695 * Sets the error handler. 696 * 697 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 698 */ 699 public void setErrorHandler(ErrorHandler eh) { 700 this.errorHandler = eh; 701 } 702 703 public ErrorHandler getErrorHandler() { 704 return this.errorHandler; 705 } 706 707 /** 708 * Gets the errorProfile. 709 * 710 * @param errorProfile 711 */ 712 public HashMap getErrorProfile() { 713 return null; 714 } 715 716 /** 717 * Sets the commentPolicy. 718 * 719 * @param commentPolicy 720 * the commentPolicy to set 721 */ 722 public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 723 this.commentPolicy = commentPolicy; 724 } 725 726 /** 727 * Sets the contentNonXmlCharPolicy. 728 * 729 * @param contentNonXmlCharPolicy 730 * the contentNonXmlCharPolicy to set 731 */ 732 public void setContentNonXmlCharPolicy( 733 XmlViolationPolicy contentNonXmlCharPolicy) { 734 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { 735 throw new IllegalArgumentException( 736 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); 737 } 738 } 739 740 /** 741 * Sets the contentSpacePolicy. 742 * 743 * @param contentSpacePolicy 744 * the contentSpacePolicy to set 745 */ 746 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 747 this.contentSpacePolicy = contentSpacePolicy; 748 } 749 750 /** 751 * Sets the xmlnsPolicy. 
*
     * @param xmlnsPolicy
     *            the xmlnsPolicy to set
     * @throws IllegalArgumentException
     *             if the policy is <code>XmlViolationPolicy.FATAL</code>
     */
    public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
        if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
            throw new IllegalArgumentException("Can't use FATAL here.");
        }
        this.xmlnsPolicy = xmlnsPolicy;
    }

    /**
     * Sets the namePolicy.
     *
     * @param namePolicy
     *            the namePolicy to set
     */
    public void setNamePolicy(XmlViolationPolicy namePolicy) {
        this.namePolicy = namePolicy;
    }

    // ]NOCPP]

    // For the token handler to call

    /**
     * Sets the tokenizer state and clears the end tag expectation, both the
     * <code>ElementName</code> and its array form.
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     */
    public void setState(int specialTokenizerState) {
        this.stateSave = specialTokenizerState;
        this.endTagExpectation = null;
        this.endTagExpectationAsArray = null;
    }

    // [NOCPP[

    /**
     * Sets the tokenizer state and the associated element name. This should
     * only ever be used to put the tokenizer into one of the states that have
     * a special end tag expectation. For use from the tokenizer test harness.
     *
     * <p>
     * When the state is <code>DATA</code>, only the state is stored and the
     * existing end tag expectation is left untouched.
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
    public void setStateAndEndTagExpectation(int specialTokenizerState,
            @Local String endTagExpectation) {
        this.stateSave = specialTokenizerState;
        if (specialTokenizerState == Tokenizer.DATA) {
            return;
        }
        @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
        this.endTagExpectation = ElementName.elementNameByBuffer(asArray,
                asArray.length);
        assert this.endTagExpectation != null;
        endTagExpectationToArray();
    }

    // ]NOCPP]

    /**
     * Sets the tokenizer state and the associated element name.
* This should
     * only ever be used to put the tokenizer into one of the states that have
     * a special end tag expectation.
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
    public void setStateAndEndTagExpectation(int specialTokenizerState,
            ElementName endTagExpectation) {
        this.stateSave = specialTokenizerState;
        this.endTagExpectation = endTagExpectation;
        endTagExpectationToArray();
    }

    /**
     * Points <code>endTagExpectationAsArray</code> at the shared static char
     * array matching the group of <code>endTagExpectation</code>. An
     * unexpected group trips an assertion and leaves the array unchanged.
     */
    private void endTagExpectationToArray() {
        switch (endTagExpectation.getGroup()) {
            case TreeBuilder.TITLE:
                endTagExpectationAsArray = TITLE_ARR;
                return;
            case TreeBuilder.SCRIPT:
                endTagExpectationAsArray = SCRIPT_ARR;
                return;
            case TreeBuilder.STYLE:
                endTagExpectationAsArray = STYLE_ARR;
                return;
            case TreeBuilder.PLAINTEXT:
                endTagExpectationAsArray = PLAINTEXT_ARR;
                return;
            case TreeBuilder.XMP:
                endTagExpectationAsArray = XMP_ARR;
                return;
            case TreeBuilder.TEXTAREA:
                endTagExpectationAsArray = TEXTAREA_ARR;
                return;
            case TreeBuilder.IFRAME:
                endTagExpectationAsArray = IFRAME_ARR;
                return;
            case TreeBuilder.NOEMBED:
                endTagExpectationAsArray = NOEMBED_ARR;
                return;
            case TreeBuilder.NOSCRIPT:
                endTagExpectationAsArray = NOSCRIPT_ARR;
                return;
            case TreeBuilder.NOFRAMES:
                endTagExpectationAsArray = NOFRAMES_ARR;
                return;
            default:
                assert false: "Bad end tag expectation.";
                return;
        }
    }

    /**
     * For C++ use only.
     *
     * @param line
     *            the line number to report from <code>getLineNumber()</code>
     */
    public void setLineNumber(int line) {
        // CPPONLY: this.attributeLine = line; // XXX is this needed?
        this.line = line;
    }

    // start Locator impl

    /**
     * @see org.xml.sax.Locator#getLineNumber()
     */
    @Inline public int getLineNumber() {
        return line;
    }

    // [NOCPP[

    /**
     * Always returns -1; this class does not report a column.
     *
     * @see org.xml.sax.Locator#getColumnNumber()
     */
    @Inline public int getColumnNumber() {
        return -1;
    }

    /**
     * @see org.xml.sax.Locator#getPublicId()
     */
    public String getPublicId() {
        return publicId;
    }

    /**
     * @see org.xml.sax.Locator#getSystemId()
     */
    public String getSystemId() {
        return systemId;
    }

    /**
     * Always returns "1.0".
     *
     * @see org.xml.sax.ext.Locator2#getXMLVersion()
     */
    public String getXMLVersion() {
        return "1.0";
    }

    /**
     * Returns the encoding reported by the encoding declaration handler, or
     * <code>null</code> if there is no handler or it throws a
     * <code>SAXException</code>.
     *
     * @see org.xml.sax.ext.Locator2#getEncoding()
     */
    public String getEncoding() {
        try {
            return encodingDeclarationHandler == null ? null : encodingDeclarationHandler.getCharacterEncoding();
        } catch (SAXException e) {
            return null;
        }
    }

    // end Locator impl

    // end public API

    /**
     * Records that the meta boundary has been passed by setting
     * <code>metaBoundaryPassed</code>.
     */
    public void notifyAboutMetaBoundary() {
        metaBoundaryPassed = true;
    }

    // ]NOCPP]

    /**
     * Returns an attribute holder for a tag without attributes. In Java this
     * is a new <code>HtmlAttributes</code> when
     * <code>newAttributesEachTime</code> is set, otherwise the shared
     * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>.
     */
    @Inline HtmlAttributes emptyAttributes() {
        // [NOCPP[
        if (newAttributesEachTime) {
            return new HtmlAttributes(mappingLangToXmlLang);
        } else {
            // ]NOCPP]
            return HtmlAttributes.EMPTY_ATTRIBUTES;
            // [NOCPP[
        }
        // ]NOCPP]
    }

    /**
     * Appends one code unit to <code>charRefBuf</code>. The Java code does no
     * explicit capacity check.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    private void appendCharRefBuf(char c) {
        // CPPONLY: assert charRefBufLen < charRefBuf.length:
        // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!";
        charRefBuf[charRefBufLen++] = c;
    }

    /**
     * Disposes of the contents of <code>charRefBuf</code>. For any return
     * state other than <code>DATA</code> or <code>RCDATA</code> they are
     * appended to <code>strBuf</code>. Otherwise they are emitted to the
     * token handler as characters, if there are any. Either way
     * <code>charRefBuf</code> ends up empty.
     *
     * @param returnState
     *            the state the tokenizer returns to
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendCharRefBufToStrBuf();
        } else {
            if (charRefBufLen > 0) {
                tokenHandler.characters(charRefBuf, 0, charRefBufLen);
                charRefBufLen = 0;
            }
        }
    }

    /**
     * Marks <code>strBuf</code> as empty after its contents were consumed.
     */
    @Inline private void clearStrBufAfterUse() {
        strBufLen = 0;
    }

    /**
     * Asserts that <code>strBuf</code> is already empty, then resets its
     * length anyway.
     */
    @Inline private void clearStrBufBeforeUse() {
        assert strBufLen == 0: "strBufLen not reset after previous use!";
        strBufLen = 0; // no-op in the absence of bugs
    }

    /**
     * Empties <code>strBuf</code>, asserting that it holds exactly one
     * hyphen.
     */
    @Inline private void clearStrBufAfterOneHyphen() {
        assert strBufLen == 1: "strBufLen length not one!";
        assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
        strBufLen = 0;
    }

    /**
     * Appends to the buffer. The Java code does no explicit capacity check.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendStrBuf(char c) {
        // CPPONLY: if (strBufLen == strBuf.length) {
        // CPPONLY: EnsureBufferSpaceShouldNeverHappen(1);
        // CPPONLY: }
        strBuf[strBufLen++] = c;
    }

    /**
     * The buffer as a String. Currently only used for error reporting.
     * Empties the buffer as a side effect.
     *
     * <p>
     * C++ memory note: The return value must be released.
     *
     * @return the buffer as a string
     */
    @Inline protected String strBufToString() {
        // CPPONLY: String digitAtom = TryAtomizeForSingleDigit();
        // CPPONLY: if (digitAtom) {
        // CPPONLY: return digitAtom;
        // CPPONLY: }
        // CPPONLY:
        // CPPONLY: boolean maybeAtomize = false;
        // CPPONLY: if (!newAttributesEachTime) {
        // CPPONLY: if (attributeName == AttributeName.CLASS ||
        // CPPONLY: attributeName == AttributeName.TYPE) {
        // CPPONLY: maybeAtomize = true;
        // CPPONLY: }
        // CPPONLY: }
        // CPPONLY:
        String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
        // CPPONLY: , tokenHandler, maybeAtomize
        );
        clearStrBufAfterUse();
        return str;
    }

    /**
     * Returns the buffer as a local name. The return value is released in
     * emitDoctypeToken().
*
     * NOTE: nothing is returned to the caller. The local name is stored in
     * <code>doctypeName</code> and the buffer is emptied.
     */
    @Inline private void strBufToDoctypeName() {
        doctypeName = Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner);
        clearStrBufAfterUse();
    }

    /**
     * Emits the buffer as character tokens and empties it. Does nothing when
     * the buffer is already empty.
     *
     * @throws SAXException
     *             if the token handler threw
     */
    @Inline private void emitStrBuf() throws SAXException {
        if (strBufLen > 0) {
            tokenHandler.characters(strBuf, 0, strBufLen);
            clearStrBufAfterUse();
        }
    }

    /**
     * Appends a second consecutive hyphen to a bogus comment, applying the
     * comment policy in Java. <code>ALTER_INFOSET</code> first appends a
     * space and then deliberately falls through to <code>ALLOW</code>, which
     * warns and appends the hyphen. <code>FATAL</code> reports a fatal error
     * and appends nothing.
     *
     * @throws SAXException
     *             if the error handler threw or the policy is FATAL
     */
    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf('-');
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }

    // [NOCPP[
    /**
     * Handles a trailing hyphen in a bogus comment according to the comment
     * policy. <code>ALTER_INFOSET</code> appends a space and then
     * deliberately falls through to the <code>ALLOW</code> warning.
     * <code>FATAL</code> reports a fatal error.
     *
     * @throws SAXException
     *             if the error handler threw or the policy is FATAL
     */
    private void maybeAppendSpaceToBogusComment() throws SAXException {
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
                break;
        }
    }

    // ]NOCPP]

    /**
     * Appends <code>c</code> to the buffer after a double hyphen, applying
     * the comment policy in Java. <code>ALTER_INFOSET</code> drops the last
     * buffered code unit and appends a space and a hyphen in its place, then
     * deliberately falls through to <code>ALLOW</code>. <code>ALLOW</code>
     * warns unless a warning was already reported, then appends
     * <code>c</code>. <code>FATAL</code> reports a fatal error and appends
     * nothing.
     *
     * @param c
     *            the UTF-16 code unit to append
     * @param reportedConsecutiveHyphens
     *            <code>true</code> to suppress the warning
     * @throws SAXException
     *             if the error handler threw or the policy is FATAL
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c, boolean reportedConsecutiveHyphens)
            throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                strBufLen--;
                // WARNING!!! This expands the worst case of the buffer length
                // given the length of input!
                appendStrBuf(' ');
                appendStrBuf('-');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                if (!reportedConsecutiveHyphens) {
                    warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                }
                // ]NOCPP]
                appendStrBuf(c);
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }

    /**
     * Appends a range of code units to the buffer. The Java code does no
     * explicit capacity check.
     *
     * @param buffer
     *            the source array
     * @param offset
     *            index of the first code unit to copy
     * @param length
     *            number of code units to copy
     */
    @Inline private void appendStrBuf(@NoLength char[] buffer, int offset, int length) throws SAXException {
        // Years of crash stats have shown that this addition doesn't overflow, as it logically
        // shouldn't.
        int newLen = strBufLen + length;
        // CPPONLY: if (strBuf.length < newLen) {
        // CPPONLY: EnsureBufferSpaceShouldNeverHappen(length);
        // CPPONLY: }
        System.arraycopy(buffer, offset, strBuf, strBufLen, length);
        strBufLen = newLen;
    }

    /**
     * Append the contents of the char reference buffer to the main one and
     * empty the char reference buffer.
     */
    @Inline private void appendCharRefBufToStrBuf() throws SAXException {
        appendStrBuf(charRefBuf, 0, charRefBufLen);
        charRefBufLen = 0;
    }

    /**
     * Emits the current comment token.
     *
     * NOTE: The method may set <code>shouldSuspend</code>, so the caller
     * must have this pattern after the state's <code>transition</code> call:
     *
     * <pre>
     * if (shouldSuspend) {
     *     break stateloop;
     * }
     * continue stateloop;
     * </pre>
     *
     * @param provisionalHyphens
     *            the number of chars at the end of the buffer that are left
     *            out of the emitted comment
     * @param pos
     *            the buffer index at which the comment ended (presumably the
     *            closing '&gt;' - confirm against callers); the next range of
     *            character tokens is set to start at <code>pos + 1</code>
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // CPPONLY: RememberGt(pos);
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            tokenHandler.comment(strBuf, 0, strBufLen
                    - provisionalHyphens);
            // [NOCPP[
        }
        // ]NOCPP]
        clearStrBufAfterUse();
        cstart = pos + 1;
        suspendIfRequestedAfterCurrentNonTextToken();
    }

    /**
     * Flushes coalesced character tokens.
     *
     * Emits the chars in <code>buf</code> from <code>cstart</code> up to but
     * not including <code>pos</code>, if that range is non-empty, and then
     * sets <code>cstart</code> to <code>Integer.MAX_VALUE</code> so that a
     * later call emits nothing until <code>cstart</code> is set again.
     *
     * @param buf
     *            the buffer holding the pending chars
     * @param pos
     *            the exclusive end index of the range to flush
     *
     * @throws SAXException
     *             if the token handler threw
     */
    protected void flushChars(@NoLength char[] buf, int pos)
            throws SAXException {
        if (pos > cstart) {
            tokenHandler.characters(buf, cstart, pos - cstart);
        }
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Reports a condition that would make the infoset incompatible with XML
     * 1.0 as fatal.
     *
     * The exception is passed to the error handler, if one is set, and is
     * then always thrown from this method.
     *
     * @param message
     *            the message
     * @throws SAXException
     * @throws SAXParseException
     *             always, carrying <code>message</code> and this tokenizer as
     *             the locator
     */
    public void fatal(String message) throws SAXException {
        SAXParseException spe = new SAXParseException(message, this);
        if (errorHandler != null) {
            errorHandler.fatalError(spe);
        }
        throw spe;
    }

    /**
     * Reports a Parse Error.
     *
     * Does nothing when no error handler is set.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler threw
     */
    public void err(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        errorHandler.error(spe);
    }

    /**
     * Reports a Parse Error, preferring the tree builder's error handler.
     *
     * If the token handler is a <code>TreeBuilder</code> and it has an error
     * handler, that handler is used; otherwise this tokenizer's own error
     * handler is used. Does nothing when neither is available.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler threw
     */
    public void errTreeBuilder(String message) throws SAXException {
        ErrorHandler eh = null;
        if (tokenHandler instanceof TreeBuilder<?>) {
            TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
            eh = treeBuilder.getErrorHandler();
        }
        if (eh == null) {
            eh = errorHandler;
        }
        if (eh == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        eh.error(spe);
    }

    /**
     * Reports a warning.
     *
     * Does nothing when no error handler is set.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler threw
     */
    public void warn(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        errorHandler.warning(spe);
    }

    /**
     * Turns the buffered tag name into <code>tagName</code>.
     *
     * A name containing a hyphen is either the annotation-xml element or is
     * stored in <code>nonInternedTagName</code>. A name without a hyphen is
     * looked up among the known element names and only stored in
     * <code>nonInternedTagName</code> when the lookup finds nothing.
     * Afterwards <code>containsHyphen</code> is reset and the buffer is
     * marked as used.
     */
    private void strBufToElementNameString() {
        if (containsHyphen) {
            // We've got a custom element or annotation-xml.
            @Local String annotationName = ElementName.ANNOTATION_XML.getName();
            if (Portability.localEqualsBuffer(annotationName, strBuf, strBufLen)) {
                tagName = ElementName.ANNOTATION_XML;
            } else {
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                        interner)
                        // CPPONLY: , true
                        );
                tagName = nonInternedTagName;
            }
        } else {
            tagName = ElementName.elementNameByBuffer(strBuf, strBufLen);
            if (tagName == null) {
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                        interner)
                        // CPPONLY: , false
                        );
                tagName = nonInternedTagName;
            }
        }
        containsHyphen = false;
        clearStrBufAfterUse();
    }

    /**
     * Emits a tag token.
     *
     * NOTE: The method may set <code>shouldSuspend</code>, so the caller
     * must have this pattern after the state's <code>transition</code> call:
     * <pre>
     * if (shouldSuspend) {
     *     break stateloop;
     * }
     * continue stateloop;
     * </pre>
     *
     * @param selfClosing
     *            passed to the slash-in-end-tag check and, for start tags,
     *            on to the token handler
     * @param pos
     *            the buffer index at which the tag ended (presumably the
     *            closing '&gt;' - confirm against callers); the next range of
     *            character tokens is set to start at <code>pos + 1</code>
     * @return the value of <code>stateSave</code> after the token handler has
     *         been called; this is the data state unless the handler changed
     *         it
     * @throws SAXException
     *             if the token handler or error handler threw
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        // CPPONLY: RememberGt(pos);
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        stateSave = Tokenizer.DATA;
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY: Portability.delete(attributes);
            // CPPONLY: attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY: assert newAttributesEachTime;
            // CPPONLY: Portability.delete(attributes);
            // CPPONLY: attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName = null;
        if (newAttributesEachTime) {
            attributes = null;
        } else {
            // NOTE(review): this branch dereferences attributes without a
            // null check; it presumably relies on attributes being allocated
            // elsewhere whenever newAttributesEachTime is false - confirm.
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        suspendIfRequestedAfterCurrentNonTextToken();
        return stateSave;
    }

    /**
     * Turns the buffered attribute name into <code>attributeName</code> and
     * checks it for duplication.
     *
     * The attribute holder is created on demand. If the current tag already
     * has an attribute with this name, the duplicate is reported and
     * <code>attributeName</code> is set to <code>null</code>, which the
     * add-attribute methods treat as "drop this attribute".
     *
     * @throws SAXException
     *             if the error handler threw
     */
    private void attributeNameComplete() throws SAXException {
        attributeName = AttributeName.nameByBuffer(strBuf, strBufLen, interner);
        if (attributeName == null) {
            // [NOCPP[
            attributeName = AttributeName.createAttributeName(
                    Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                            interner),
                    namePolicy != XmlViolationPolicy.ALLOW);
            // ]NOCPP]
            // CPPONLY: nonInternedAttributeName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner));
            // CPPONLY: attributeName = nonInternedAttributeName;
        }
        clearStrBufAfterUse();

        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName = null;
        }
    }

    /**
     * Adds the current attribute to the current tag with an empty value.
     *
     * When <code>attributeName</code> is <code>null</code> (the attribute was
     * dropped as a duplicate), nothing is added and the buffer is marked as
     * used instead.
     *
     * @throws SAXException
     *             if the error handler threw
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (AttributeName.SRC == attributeName
                    || AttributeName.HREF == attributeName) {
                warn("Attribute \u201C"
                        + attributeName.getLocal(AttributeName.HTML)
                        + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName,
                    Portability.newEmptyString()
                    // [NOCPP[
                    , xmlnsPolicy
                    // ]NOCPP]
                    // CPPONLY: , attributeLine
            );
            attributeName = null;
        } else {
            clearStrBufAfterUse();
        }
    }

    /**
     * Adds the current attribute to the current tag, taking the value from
     * the buffer.
     *
     * When <code>attributeName</code> is <code>null</code> (the attribute was
     * dropped as a duplicate), the buffered value is discarded.
     *
     * @throws SAXException
     *             if the error handler threw
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = strBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            attributeName = null;
        } else {
            // We have a duplicate attribute. Explicitly discard its value.
            clearStrBufAfterUse();
        }
    }

    // [NOCPP[

    /**
     * Hook called at the end of {@link #start()}. This implementation does
     * nothing; it is <code>protected</code> so that a subclass can override
     * it.
     *
     * @throws SAXException
     */
    protected void startErrorReporting() throws SAXException {

    }

    // ]NOCPP]

    /**
     * Starts tokenization: initializes the tokenizer, notifies the token
     * handler and then calls {@link #startErrorReporting()}.
     *
     * @throws SAXException
     *             if the token handler threw
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // CPPONLY: if (mViewSource) {
        // CPPONLY: line = 1;
        // CPPONLY: col = -1;
        // CPPONLY: nextCharOnNewLine = false;
        // CPPONLY: } else if (tokenHandler.WantsLineAndColumn()) {
        // CPPONLY: line = 0;
        // CPPONLY: col = 1;
        // CPPONLY: nextCharOnNewLine = true;
        // CPPONLY: } else {
        // CPPONLY: line = -1;
        // CPPONLY: col = -1;
        // CPPONLY: nextCharOnNewLine = false;
        // CPPONLY: }
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }

    /**
     * Tokenizes the unread part of <code>buffer</code>, resuming from the
     * saved tokenizer state.
     *
     * On return the buffer's start index has been advanced: to the end of
     * the buffer when all of it was consumed, otherwise to the index after
     * the last char that was read.
     *
     * @param buffer
     *            the input; its start index is updated
     * @return the value of <code>lastCR</code> after the run, which the state
     *         loop's own comments describe as telling the IO driver to
     *         swallow a directly following LF
     * @throws SAXException
     *             if the token handler or error handler threw
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        int end = buffer.getEnd();

        // In C++, the caller of tokenizeBuffer needs to do this explicitly.
        // [NOCPP[
        ensureBufferSpace(end - start);
        // ]NOCPP]

        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         */
        int pos = start - 1;

        // In the states listed below a range of character tokens starts at
        // the beginning of this buffer; in every other state no range is
        // open.
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        // CPPONLY: if (mViewSource) {
        // CPPONLY: mViewSource.SetBuffer(buffer);
        // CPPONLY: if (htmlaccelEnabled()) {
        // CPPONLY: pos = StateLoopViewSourceSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: } else {
        // CPPONLY: pos = StateLoopViewSourceALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else if (tokenHandler.WantsLineAndColumn()) {
        // CPPONLY: if (htmlaccelEnabled()) {
        // CPPONLY: pos = StateLoopLineColSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: } else {
        // CPPONLY: pos = StateLoopLineColALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // CPPONLY: } else if (htmlaccelEnabled()) {
        // CPPONLY: pos = StateLoopFastestSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: } else {
        // CPPONLY: pos = StateLoopFastestALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                end);
        // ]NOCPP]
        if (pos == end) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }

    // [NOCPP[
    /**
     * Makes sure <code>strBuf</code> is large enough for the worst case of
     * tokenizing <code>inputLength</code> more chars, and asks the token
     * handler to make the same provision.
     *
     * The buffer is allocated on first use and otherwise grown to at least
     * the worst case, keeping its current contents.
     *
     * @param inputLength
     *            the number of input chars about to be tokenized
     * @throws SAXException
     *             if the token handler threw
     */
    private void ensureBufferSpace(int inputLength) throws SAXException {
        // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
        // Adding to the general worst case instead of only the
        // TreeBuilder-exposed worst case to avoid re-introducing a bug when
        // unifying the tokenizer and tree builder buffers in the future.
        int worstCase = strBufLen + inputLength + charRefBufLen + 2;
        tokenHandler.ensureBufferSpace(worstCase);
        if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
            // When altering infoset, if the comment contents are consecutive
            // hyphens, each hyphen generates a space, too. These buffer
            // contents never get emitted as characters() to the tokenHandler,
            // which is why this calculation happens after the call to
            // ensureBufferSpace on tokenHandler.
            worstCase *= 2;
        }
        if (strBuf == null) {
            // Add an arbitrary small value to avoid immediate reallocation
            // once there are a few characters in the buffer.
            strBuf = new char[worstCase + 128];
        } else if (worstCase > strBuf.length) {
            // HotSpot reportedly allocates memory with 8-byte accuracy, so
            // there's no point in trying to do math here to avoid slop.
            // Maybe we should add some small constant to worstCase here
            // but not doing that without profiling. In C++ with jemalloc,
            // the corresponding method should do math to round up here
            // to avoid slop.
            char[] newBuf = new char[Math.max(worstCase, (strBuf.length*5)/4)];
            System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
            strBuf = newBuf;
        }
    }
    // ]NOCPP]

    @SuppressWarnings("unused") @Inline private int stateLoop(int state, char c,
            int pos, @NoLength char[] buf, boolean reconsume, int returnState,
            int endPos) throws SAXException {
        boolean reportedConsecutiveHyphens = false;
        /*
         * Idioms used in this code:
         *
         *
         * Consuming the next input character
         *
         * To consume the next input character, the code does this: if (++pos ==
         * endPos) { break stateloop; } c = checkChar(buf, pos);
         *
         *
         * Staying in a state
         *
         * When there's a state that the tokenizer may stay in over multiple
         * input characters, the state has a wrapper |for(;;)| loop and staying
         * in the state continues the loop.
         *
         *
         * Switching to another state
         *
         * To switch to another state, the code sets the state variable to the
         * magic number of the new state. Then it either continues stateloop or
         * breaks out of the state's own wrapper loop if the target state is
         * right after the current state in source order. (This is a partial
         * workaround for Java's lack of goto.)
1586 * 1587 * 1588 * Reconsume support 1589 * 1590 * The spec sometimes says that an input character is reconsumed in 1591 * another state. If a state can ever be entered so that an input 1592 * character can be reconsumed in it, the state's code starts with an 1593 * |if (reconsume)| that sets reconsume to false and skips over the 1594 * normal code for consuming a new character. 1595 * 1596 * To reconsume the current character in another state, the code sets 1597 * |reconsume| to true and then switches to the other state. 1598 * 1599 * 1600 * Emitting character tokens 1601 * 1602 * This method emits character tokens lazily. Whenever a new range of 1603 * character tokens starts, the field cstart must be set to the start 1604 * index of the range. The flushChars() method must be called at the end 1605 * of a range to flush it. 1606 * 1607 * 1608 * U+0000 handling 1609 * 1610 * The various states have to handle the replacement of U+0000 with 1611 * U+FFFD. However, if U+0000 would be reconsumed in another state, the 1612 * replacement doesn't need to happen, because it's handled by the 1613 * reconsuming state. 1614 * 1615 * 1616 * LF handling 1617 * 1618 * Every state needs to increment the line number upon LF unless the LF 1619 * gets reconsumed by another state which increments the line number. 1620 * 1621 * 1622 * CR handling 1623 * 1624 * Every state needs to handle CR unless the CR gets reconsumed and is 1625 * handled by the reconsuming state. The CR needs to be handled as if it 1626 * were and LF, the lastCR field must be set to true and then this 1627 * method must return. The IO driver will then swallow the next 1628 * character if it is an LF to coalesce CRLF. 1629 */ 1630 stateloop: for (;;) { 1631 switch (state) { 1632 case DATA: 1633 dataloop: for (;;) { 1634 if (reconsume) { 1635 reconsume = false; 1636 } else { 1637 ++pos; 1638 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 
1639 // The line below advances pos by some number of code units that this state is indifferent to. 1640 // CPPONLY: pos += accelerateAdvancementData(buf, pos, endPos); 1641 if (pos == endPos) { 1642 break stateloop; 1643 } 1644 c = checkChar(buf, pos); 1645 } 1646 switch (c) { 1647 case '&': 1648 /* 1649 * U+0026 AMPERSAND (&) Switch to the character 1650 * reference in data state. 1651 */ 1652 flushChars(buf, pos); 1653 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 1654 appendCharRefBuf(c); 1655 setAdditionalAndRememberAmpersandLocation('\u0000'); 1656 returnState = state; 1657 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1658 continue stateloop; 1659 case '<': 1660 /* 1661 * U+003C LESS-THAN SIGN (<) Switch to the tag 1662 * open state. 1663 */ 1664 flushChars(buf, pos); 1665 1666 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); 1667 // `break` optimizes; `continue stateloop;` would be valid 1668 break dataloop; 1669 case '\u0000': 1670 maybeEmitReplacementCharacter(buf, pos); 1671 continue; 1672 case '\r': 1673 emitCarriageReturn(buf, pos); 1674 break stateloop; 1675 case '\n': 1676 silentLineFeed(); 1677 // CPPONLY: MOZ_FALLTHROUGH; 1678 default: 1679 /* 1680 * Anything else Emit the input character as a 1681 * character token. 1682 * 1683 * Stay in the data state. 1684 */ 1685 continue; 1686 } 1687 } 1688 // CPPONLY: MOZ_FALLTHROUGH; 1689 case TAG_OPEN: 1690 tagopenloop: for (;;) { 1691 /* 1692 * The behavior of this state depends on the content 1693 * model flag. 
1694 */ 1695 if (++pos == endPos) { 1696 break stateloop; 1697 } 1698 c = checkChar(buf, pos); 1699 /* 1700 * If the content model flag is set to the PCDATA state 1701 * Consume the next input character: 1702 */ 1703 if (c >= 'A' && c <= 'Z') { 1704 /* 1705 * U+0041 LATIN CAPITAL LETTER A through to U+005A 1706 * LATIN CAPITAL LETTER Z Create a new start tag 1707 * token, 1708 */ 1709 endTag = false; 1710 /* 1711 * set its tag name to the lowercase version of the 1712 * input character (add 0x0020 to the character's 1713 * code point), 1714 */ 1715 clearStrBufBeforeUse(); 1716 appendStrBuf((char) (c + 0x20)); 1717 containsHyphen = false; 1718 /* then switch to the tag name state. */ 1719 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1720 /* 1721 * (Don't emit the token yet; further details will 1722 * be filled in before it is emitted.) 1723 */ 1724 // `break` optimizes; `continue stateloop;` would be valid 1725 break tagopenloop; 1726 } else if (c >= 'a' && c <= 'z') { 1727 /* 1728 * U+0061 LATIN SMALL LETTER A through to U+007A 1729 * LATIN SMALL LETTER Z Create a new start tag 1730 * token, 1731 */ 1732 endTag = false; 1733 /* 1734 * set its tag name to the input character, 1735 */ 1736 clearStrBufBeforeUse(); 1737 appendStrBuf(c); 1738 containsHyphen = false; 1739 /* then switch to the tag name state. */ 1740 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1741 /* 1742 * (Don't emit the token yet; further details will 1743 * be filled in before it is emitted.) 1744 */ 1745 // `break` optimizes; `continue stateloop;` would be valid 1746 break tagopenloop; 1747 } 1748 switch (c) { 1749 case '!': 1750 /* 1751 * U+0021 EXCLAMATION MARK (!) Switch to the 1752 * markup declaration open state. 1753 */ 1754 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); 1755 continue stateloop; 1756 case '/': 1757 /* 1758 * U+002F SOLIDUS (/) Switch to the close tag 1759 * open state. 
1760 */ 1761 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); 1762 continue stateloop; 1763 case '?': 1764 // CPPONLY: if (viewingXmlSource) { 1765 // CPPONLY: state = transition(state, 1766 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, 1767 // CPPONLY: reconsume, 1768 // CPPONLY: pos); 1769 // CPPONLY: continue stateloop; 1770 // CPPONLY: } 1771 /* 1772 * U+003F QUESTION MARK (?) Parse error. 1773 */ 1774 errProcessingInstruction(); 1775 /* 1776 * Switch to the bogus comment state. 1777 */ 1778 clearStrBufBeforeUse(); 1779 appendStrBuf(c); 1780 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1781 continue stateloop; 1782 case '>': 1783 /* 1784 * U+003E GREATER-THAN SIGN (>) Parse error. 1785 */ 1786 errLtGt(); 1787 /* 1788 * Emit a U+003C LESS-THAN SIGN character token 1789 * and a U+003E GREATER-THAN SIGN character 1790 * token. 1791 */ 1792 tokenHandler.characters(Tokenizer.LT_GT, 0, 2); 1793 /* Switch to the data state. */ 1794 cstart = pos + 1; 1795 state = transition(state, Tokenizer.DATA, reconsume, pos); 1796 continue stateloop; 1797 default: 1798 /* 1799 * Anything else Parse error. 1800 */ 1801 errBadCharAfterLt(c); 1802 /* 1803 * Emit a U+003C LESS-THAN SIGN character token 1804 */ 1805 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1806 /* 1807 * and reconsume the current input character in 1808 * the data state. 
1809 */ 1810 cstart = pos; 1811 reconsume = true; 1812 state = transition(state, Tokenizer.DATA, reconsume, pos); 1813 continue stateloop; 1814 } 1815 } 1816 // CPPONLY: MOZ_FALLTHROUGH; 1817 case TAG_NAME: 1818 tagnameloop: for (;;) { 1819 if (++pos == endPos) { 1820 break stateloop; 1821 } 1822 c = checkChar(buf, pos); 1823 /* 1824 * Consume the next input character: 1825 */ 1826 switch (c) { 1827 case '\r': 1828 silentCarriageReturn(); 1829 strBufToElementNameString(); 1830 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1831 break stateloop; 1832 case '\n': 1833 silentLineFeed(); 1834 // CPPONLY: MOZ_FALLTHROUGH; 1835 case ' ': 1836 case '\t': 1837 case '\u000C': 1838 /* 1839 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1840 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1841 * Switch to the before attribute name state. 1842 */ 1843 strBufToElementNameString(); 1844 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1845 // `break` optimizes; `continue stateloop;` would be valid 1846 break tagnameloop; 1847 case '/': 1848 /* 1849 * U+002F SOLIDUS (/) Switch to the self-closing 1850 * start tag state. 1851 */ 1852 strBufToElementNameString(); 1853 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1854 continue stateloop; 1855 case '>': 1856 /* 1857 * U+003E GREATER-THAN SIGN (>) Emit the current 1858 * tag token. 1859 */ 1860 strBufToElementNameString(); 1861 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1862 if (shouldSuspend) { 1863 break stateloop; 1864 } 1865 /* 1866 * Switch to the data state. 
1867 */ 1868 continue stateloop; 1869 case '\u0000': 1870 c = '\uFFFD'; 1871 // CPPONLY: MOZ_FALLTHROUGH; 1872 default: 1873 if (c >= 'A' && c <= 'Z') { 1874 /* 1875 * U+0041 LATIN CAPITAL LETTER A through to 1876 * U+005A LATIN CAPITAL LETTER Z Append the 1877 * lowercase version of the current input 1878 * character (add 0x0020 to the character's 1879 * code point) to the current tag token's 1880 * tag name. 1881 */ 1882 c += 0x20; 1883 } else if (c == '-') { 1884 containsHyphen = true; 1885 } 1886 /* 1887 * Anything else Append the current input 1888 * character to the current tag token's tag 1889 * name. 1890 */ 1891 appendStrBuf(c); 1892 /* 1893 * Stay in the tag name state. 1894 */ 1895 continue; 1896 } 1897 } 1898 // CPPONLY: MOZ_FALLTHROUGH; 1899 case BEFORE_ATTRIBUTE_NAME: 1900 beforeattributenameloop: for (;;) { 1901 if (reconsume) { 1902 reconsume = false; 1903 } else { 1904 if (++pos == endPos) { 1905 break stateloop; 1906 } 1907 c = checkChar(buf, pos); 1908 } 1909 /* 1910 * Consume the next input character: 1911 */ 1912 switch (c) { 1913 case '\r': 1914 silentCarriageReturn(); 1915 break stateloop; 1916 case '\n': 1917 silentLineFeed(); 1918 // CPPONLY: MOZ_FALLTHROUGH; 1919 case ' ': 1920 case '\t': 1921 case '\u000C': 1922 /* 1923 * U+0009 CHARACTER TABULATION U+000A LINE FEED 1924 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1925 * in the before attribute name state. 1926 */ 1927 continue; 1928 case '/': 1929 /* 1930 * U+002F SOLIDUS (/) Switch to the self-closing 1931 * start tag state. 1932 */ 1933 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1934 continue stateloop; 1935 case '>': 1936 /* 1937 * U+003E GREATER-THAN SIGN (>) Emit the current 1938 * tag token. 1939 */ 1940 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1941 if (shouldSuspend) { 1942 break stateloop; 1943 } 1944 /* 1945 * Switch to the data state. 
1946 */ 1947 continue stateloop; 1948 case '\u0000': 1949 c = '\uFFFD'; 1950 // CPPONLY: MOZ_FALLTHROUGH; 1951 case '\"': 1952 case '\'': 1953 case '<': 1954 case '=': 1955 /* 1956 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1957 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 1958 * SIGN (=) Parse error. 1959 */ 1960 errBadCharBeforeAttributeNameOrNull(c); 1961 /* 1962 * Treat it as per the "anything else" entry 1963 * below. 1964 */ 1965 // CPPONLY: MOZ_FALLTHROUGH; 1966 default: 1967 /* 1968 * Anything else Start a new attribute in the 1969 * current tag token. 1970 */ 1971 if (c >= 'A' && c <= 'Z') { 1972 /* 1973 * U+0041 LATIN CAPITAL LETTER A through to 1974 * U+005A LATIN CAPITAL LETTER Z Set that 1975 * attribute's name to the lowercase version 1976 * of the current input character (add 1977 * 0x0020 to the character's code point) 1978 */ 1979 c += 0x20; 1980 } 1981 // CPPONLY: attributeLine = line; 1982 /* 1983 * Set that attribute's name to the current 1984 * input character, 1985 */ 1986 clearStrBufBeforeUse(); 1987 appendStrBuf(c); 1988 /* 1989 * and its value to the empty string. 1990 */ 1991 // Will do later. 1992 /* 1993 * Switch to the attribute name state. 
1994 */ 1995 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 1996 // `break` optimizes; `continue stateloop;` would be valid 1997 break beforeattributenameloop; 1998 } 1999 } 2000 // CPPONLY: MOZ_FALLTHROUGH; 2001 case ATTRIBUTE_NAME: 2002 attributenameloop: for (;;) { 2003 if (++pos == endPos) { 2004 break stateloop; 2005 } 2006 c = checkChar(buf, pos); 2007 /* 2008 * Consume the next input character: 2009 */ 2010 switch (c) { 2011 case '\r': 2012 silentCarriageReturn(); 2013 attributeNameComplete(); 2014 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 2015 break stateloop; 2016 case '\n': 2017 silentLineFeed(); 2018 // CPPONLY: MOZ_FALLTHROUGH; 2019 case ' ': 2020 case '\t': 2021 case '\u000C': 2022 /* 2023 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2024 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2025 * Switch to the after attribute name state. 2026 */ 2027 attributeNameComplete(); 2028 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 2029 continue stateloop; 2030 case '/': 2031 /* 2032 * U+002F SOLIDUS (/) Switch to the self-closing 2033 * start tag state. 2034 */ 2035 attributeNameComplete(); 2036 addAttributeWithoutValue(); 2037 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2038 continue stateloop; 2039 case '=': 2040 /* 2041 * U+003D EQUALS SIGN (=) Switch to the before 2042 * attribute value state. 2043 */ 2044 attributeNameComplete(); 2045 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 2046 // `break` optimizes; `continue stateloop;` would be valid 2047 break attributenameloop; 2048 case '>': 2049 /* 2050 * U+003E GREATER-THAN SIGN (>) Emit the current 2051 * tag token. 2052 */ 2053 attributeNameComplete(); 2054 addAttributeWithoutValue(); 2055 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2056 if (shouldSuspend) { 2057 break stateloop; 2058 } 2059 /* 2060 * Switch to the data state. 
2061 */ 2062 continue stateloop; 2063 case '\u0000': 2064 c = '\uFFFD'; 2065 // CPPONLY: MOZ_FALLTHROUGH; 2066 case '\"': 2067 case '\'': 2068 case '<': 2069 /* 2070 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 2071 * (') U+003C LESS-THAN SIGN (<) Parse error. 2072 */ 2073 errQuoteOrLtInAttributeNameOrNull(c); 2074 /* 2075 * Treat it as per the "anything else" entry 2076 * below. 2077 */ 2078 // CPPONLY: MOZ_FALLTHROUGH; 2079 default: 2080 if (c >= 'A' && c <= 'Z') { 2081 /* 2082 * U+0041 LATIN CAPITAL LETTER A through to 2083 * U+005A LATIN CAPITAL LETTER Z Append the 2084 * lowercase version of the current input 2085 * character (add 0x0020 to the character's 2086 * code point) to the current attribute's 2087 * name. 2088 */ 2089 c += 0x20; 2090 } 2091 /* 2092 * Anything else Append the current input 2093 * character to the current attribute's name. 2094 */ 2095 appendStrBuf(c); 2096 /* 2097 * Stay in the attribute name state. 2098 */ 2099 continue; 2100 } 2101 } 2102 // CPPONLY: MOZ_FALLTHROUGH; 2103 case BEFORE_ATTRIBUTE_VALUE: 2104 beforeattributevalueloop: for (;;) { 2105 if (++pos == endPos) { 2106 break stateloop; 2107 } 2108 c = checkChar(buf, pos); 2109 /* 2110 * Consume the next input character: 2111 */ 2112 switch (c) { 2113 case '\r': 2114 silentCarriageReturn(); 2115 break stateloop; 2116 case '\n': 2117 silentLineFeed(); 2118 // CPPONLY: MOZ_FALLTHROUGH; 2119 case ' ': 2120 case '\t': 2121 case '\u000C': 2122 /* 2123 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2124 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2125 * in the before attribute value state. 2126 */ 2127 continue; 2128 case '"': 2129 /* 2130 * U+0022 QUOTATION MARK (") Switch to the 2131 * attribute value (double-quoted) state. 
2132 */ 2133 // CPPONLY: attributeLine = line; 2134 clearStrBufBeforeUse(); 2135 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); 2136 // `break` optimizes; `continue stateloop;` would be valid 2137 break beforeattributevalueloop; 2138 case '&': 2139 /* 2140 * U+0026 AMPERSAND (&) Switch to the attribute 2141 * value (unquoted) state and reconsume this 2142 * input character. 2143 */ 2144 // CPPONLY: attributeLine = line; 2145 clearStrBufBeforeUse(); 2146 reconsume = true; 2147 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 2148 noteUnquotedAttributeValue(); 2149 continue stateloop; 2150 case '\'': 2151 /* 2152 * U+0027 APOSTROPHE (') Switch to the attribute 2153 * value (single-quoted) state. 2154 */ 2155 // CPPONLY: attributeLine = line; 2156 clearStrBufBeforeUse(); 2157 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); 2158 continue stateloop; 2159 case '>': 2160 /* 2161 * U+003E GREATER-THAN SIGN (>) Parse error. 2162 */ 2163 errAttributeValueMissing(); 2164 /* 2165 * Emit the current tag token. 2166 */ 2167 addAttributeWithoutValue(); 2168 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2169 if (shouldSuspend) { 2170 break stateloop; 2171 } 2172 /* 2173 * Switch to the data state. 2174 */ 2175 continue stateloop; 2176 case '\u0000': 2177 c = '\uFFFD'; 2178 // CPPONLY: MOZ_FALLTHROUGH; 2179 case '<': 2180 case '=': 2181 case '`': 2182 /* 2183 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN 2184 * (=) U+0060 GRAVE ACCENT (`) 2185 */ 2186 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); 2187 /* 2188 * Treat it as per the "anything else" entry 2189 * below. 2190 */ 2191 // CPPONLY: MOZ_FALLTHROUGH; 2192 default: 2193 /* 2194 * Anything else Append the current input 2195 * character to the current attribute's value. 
2196 */ 2197 // CPPONLY: attributeLine = line; 2198 clearStrBufBeforeUse(); 2199 appendStrBuf(c); 2200 /* 2201 * Switch to the attribute value (unquoted) 2202 * state. 2203 */ 2204 2205 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 2206 noteUnquotedAttributeValue(); 2207 continue stateloop; 2208 } 2209 } 2210 // CPPONLY: MOZ_FALLTHROUGH; 2211 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 2212 attributevaluedoublequotedloop: for (;;) { 2213 if (reconsume) { 2214 reconsume = false; 2215 } else { 2216 ++pos; 2217 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 2218 // The line below advances pos by some number of code units that this state is indifferent to. 2219 // CPPONLY: pos += accelerateAdvancementAttributeValueDoubleQuoted(buf, pos, endPos); 2220 if (pos == endPos) { 2221 break stateloop; 2222 } 2223 c = checkChar(buf, pos); 2224 } 2225 /* 2226 * Consume the next input character: 2227 */ 2228 switch (c) { 2229 case '"': 2230 /* 2231 * U+0022 QUOTATION MARK (") Switch to the after 2232 * attribute value (quoted) state. 2233 */ 2234 addAttributeWithValue(); 2235 2236 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 2237 // `break` optimizes; `continue stateloop;` would be valid 2238 break attributevaluedoublequotedloop; 2239 case '&': 2240 /* 2241 * U+0026 AMPERSAND (&) Switch to the character 2242 * reference in attribute value state, with the 2243 * additional allowed character being U+0022 2244 * QUOTATION MARK ("). 
2245 */ 2246 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2247 appendCharRefBuf(c); 2248 setAdditionalAndRememberAmpersandLocation('\"'); 2249 returnState = state; 2250 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2251 continue stateloop; 2252 case '\r': 2253 appendStrBufCarriageReturn(); 2254 break stateloop; 2255 case '\n': 2256 appendStrBufLineFeed(); 2257 continue; 2258 case '\u0000': 2259 c = '\uFFFD'; 2260 // CPPONLY: MOZ_FALLTHROUGH; 2261 default: 2262 /* 2263 * Anything else Append the current input 2264 * character to the current attribute's value. 2265 */ 2266 appendStrBuf(c); 2267 /* 2268 * Stay in the attribute value (double-quoted) 2269 * state. 2270 */ 2271 continue; 2272 } 2273 } 2274 // CPPONLY: MOZ_FALLTHROUGH; 2275 case AFTER_ATTRIBUTE_VALUE_QUOTED: 2276 afterattributevaluequotedloop: for (;;) { 2277 if (++pos == endPos) { 2278 break stateloop; 2279 } 2280 c = checkChar(buf, pos); 2281 /* 2282 * Consume the next input character: 2283 */ 2284 switch (c) { 2285 case '\r': 2286 silentCarriageReturn(); 2287 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2288 break stateloop; 2289 case '\n': 2290 silentLineFeed(); 2291 // CPPONLY: MOZ_FALLTHROUGH; 2292 case ' ': 2293 case '\t': 2294 case '\u000C': 2295 /* 2296 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2297 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2298 * Switch to the before attribute name state. 2299 */ 2300 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2301 continue stateloop; 2302 case '/': 2303 /* 2304 * U+002F SOLIDUS (/) Switch to the self-closing 2305 * start tag state. 2306 */ 2307 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2308 // `break` optimizes; `continue stateloop;` would be valid 2309 break afterattributevaluequotedloop; 2310 case '>': 2311 /* 2312 * U+003E GREATER-THAN SIGN (>) Emit the current 2313 * tag token. 
2314 */ 2315 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2316 if (shouldSuspend) { 2317 break stateloop; 2318 } 2319 /* 2320 * Switch to the data state. 2321 */ 2322 continue stateloop; 2323 default: 2324 /* 2325 * Anything else Parse error. 2326 */ 2327 errNoSpaceBetweenAttributes(); 2328 /* 2329 * Reconsume the character in the before 2330 * attribute name state. 2331 */ 2332 reconsume = true; 2333 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2334 continue stateloop; 2335 } 2336 } 2337 // CPPONLY: MOZ_FALLTHROUGH; 2338 case SELF_CLOSING_START_TAG: 2339 if (++pos == endPos) { 2340 break stateloop; 2341 } 2342 c = checkChar(buf, pos); 2343 /* 2344 * Consume the next input character: 2345 */ 2346 switch (c) { 2347 case '>': 2348 /* 2349 * U+003E GREATER-THAN SIGN (>) Set the self-closing 2350 * flag of the current tag token. Emit the current 2351 * tag token. 2352 */ 2353 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); 2354 if (shouldSuspend) { 2355 break stateloop; 2356 } 2357 /* 2358 * Switch to the data state. 2359 */ 2360 continue stateloop; 2361 default: 2362 /* Anything else Parse error. */ 2363 errSlashNotFollowedByGt(); 2364 /* 2365 * Reconsume the character in the before attribute 2366 * name state. 
2367 */ 2368 reconsume = true; 2369 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2370 continue stateloop; 2371 } 2372 // no fallthrough, reordering opportunity 2373 case ATTRIBUTE_VALUE_UNQUOTED: 2374 for (;;) { 2375 if (reconsume) { 2376 reconsume = false; 2377 } else { 2378 if (++pos == endPos) { 2379 break stateloop; 2380 } 2381 c = checkChar(buf, pos); 2382 } 2383 /* 2384 * Consume the next input character: 2385 */ 2386 switch (c) { 2387 case '\r': 2388 silentCarriageReturn(); 2389 addAttributeWithValue(); 2390 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2391 break stateloop; 2392 case '\n': 2393 silentLineFeed(); 2394 // CPPONLY: MOZ_FALLTHROUGH; 2395 case ' ': 2396 case '\t': 2397 case '\u000C': 2398 /* 2399 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2400 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 2401 * Switch to the before attribute name state. 2402 */ 2403 addAttributeWithValue(); 2404 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 2405 continue stateloop; 2406 case '&': 2407 /* 2408 * U+0026 AMPERSAND (&) Switch to the character 2409 * reference in attribute value state, with the 2410 * additional allowed character being U+003E 2411 * GREATER-THAN SIGN (>) 2412 */ 2413 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 2414 appendCharRefBuf(c); 2415 setAdditionalAndRememberAmpersandLocation('>'); 2416 returnState = state; 2417 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 2418 continue stateloop; 2419 case '>': 2420 /* 2421 * U+003E GREATER-THAN SIGN (>) Emit the current 2422 * tag token. 2423 */ 2424 addAttributeWithValue(); 2425 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2426 if (shouldSuspend) { 2427 break stateloop; 2428 } 2429 /* 2430 * Switch to the data state. 
2431 */ 2432 continue stateloop; 2433 case '\u0000': 2434 c = '\uFFFD'; 2435 // CPPONLY: MOZ_FALLTHROUGH; 2436 case '<': 2437 case '\"': 2438 case '\'': 2439 case '=': 2440 case '`': 2441 /* 2442 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 2443 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 2444 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. 2445 */ 2446 errUnquotedAttributeValOrNull(c); 2447 /* 2448 * Treat it as per the "anything else" entry 2449 * below. 2450 */ 2451 // CPPONLY: MOZ_FALLTHROUGH; 2452 default: 2453 /* 2454 * Anything else Append the current input 2455 * character to the current attribute's value. 2456 */ 2457 appendStrBuf(c); 2458 /* 2459 * Stay in the attribute value (unquoted) state. 2460 */ 2461 continue; 2462 } 2463 } 2464 // no fallthrough, reordering opportunity 2465 case AFTER_ATTRIBUTE_NAME: 2466 for (;;) { 2467 if (++pos == endPos) { 2468 break stateloop; 2469 } 2470 c = checkChar(buf, pos); 2471 /* 2472 * Consume the next input character: 2473 */ 2474 switch (c) { 2475 case '\r': 2476 silentCarriageReturn(); 2477 break stateloop; 2478 case '\n': 2479 silentLineFeed(); 2480 // CPPONLY: MOZ_FALLTHROUGH; 2481 case ' ': 2482 case '\t': 2483 case '\u000C': 2484 /* 2485 * U+0009 CHARACTER TABULATION U+000A LINE FEED 2486 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 2487 * in the after attribute name state. 2488 */ 2489 continue; 2490 case '/': 2491 /* 2492 * U+002F SOLIDUS (/) Switch to the self-closing 2493 * start tag state. 2494 */ 2495 addAttributeWithoutValue(); 2496 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 2497 continue stateloop; 2498 case '=': 2499 /* 2500 * U+003D EQUALS SIGN (=) Switch to the before 2501 * attribute value state. 2502 */ 2503 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 2504 continue stateloop; 2505 case '>': 2506 /* 2507 * U+003E GREATER-THAN SIGN (>) Emit the current 2508 * tag token. 
2509 */ 2510 addAttributeWithoutValue(); 2511 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 2512 if (shouldSuspend) { 2513 break stateloop; 2514 } 2515 /* 2516 * Switch to the data state. 2517 */ 2518 continue stateloop; 2519 case '\u0000': 2520 c = '\uFFFD'; 2521 // CPPONLY: MOZ_FALLTHROUGH; 2522 case '\"': 2523 case '\'': 2524 case '<': 2525 errQuoteOrLtInAttributeNameOrNull(c); 2526 /* 2527 * Treat it as per the "anything else" entry 2528 * below. 2529 */ 2530 // CPPONLY: MOZ_FALLTHROUGH; 2531 default: 2532 addAttributeWithoutValue(); 2533 /* 2534 * Anything else Start a new attribute in the 2535 * current tag token. 2536 */ 2537 if (c >= 'A' && c <= 'Z') { 2538 /* 2539 * U+0041 LATIN CAPITAL LETTER A through to 2540 * U+005A LATIN CAPITAL LETTER Z Set that 2541 * attribute's name to the lowercase version 2542 * of the current input character (add 2543 * 0x0020 to the character's code point) 2544 */ 2545 c += 0x20; 2546 } 2547 /* 2548 * Set that attribute's name to the current 2549 * input character, 2550 */ 2551 clearStrBufBeforeUse(); 2552 appendStrBuf(c); 2553 /* 2554 * and its value to the empty string. 2555 */ 2556 // Will do later. 2557 /* 2558 * Switch to the attribute name state. 2559 */ 2560 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 2561 continue stateloop; 2562 } 2563 } 2564 // no fallthrough, reordering opportunity 2565 case MARKUP_DECLARATION_OPEN: 2566 markupdeclarationopenloop: for (;;) { 2567 if (++pos == endPos) { 2568 break stateloop; 2569 } 2570 c = checkChar(buf, pos); 2571 /* 2572 * If the next two characters are both U+002D 2573 * HYPHEN-MINUS characters (-), consume those two 2574 * characters, create a comment token whose data is the 2575 * empty string, and switch to the comment start state. 
2576 * 2577 * Otherwise, if the next seven characters are an ASCII 2578 * case-insensitive match for the word "DOCTYPE", then 2579 * consume those characters and switch to the DOCTYPE 2580 * state. 2581 * 2582 * Otherwise, if the insertion mode is 2583 * "in foreign content" and the current node is not an 2584 * element in the HTML namespace and the next seven 2585 * characters are an case-sensitive match for the string 2586 * "[CDATA[" (the five uppercase letters "CDATA" with a 2587 * U+005B LEFT SQUARE BRACKET character before and 2588 * after), then consume those characters and switch to 2589 * the CDATA section state. 2590 * 2591 * Otherwise, is is a parse error. Switch to the bogus 2592 * comment state. The next character that is consumed, 2593 * if any, is the first character that will be in the 2594 * comment. 2595 */ 2596 switch (c) { 2597 case '-': 2598 clearStrBufBeforeUse(); 2599 appendStrBuf(c); 2600 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); 2601 // `break` optimizes; `continue stateloop;` would be valid 2602 break markupdeclarationopenloop; 2603 case 'd': 2604 case 'D': 2605 clearStrBufBeforeUse(); 2606 appendStrBuf(c); 2607 index = 0; 2608 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); 2609 continue stateloop; 2610 case '[': 2611 if (tokenHandler.cdataSectionAllowed()) { 2612 clearStrBufBeforeUse(); 2613 appendStrBuf(c); 2614 index = 0; 2615 state = transition(state, Tokenizer.CDATA_START, reconsume, pos); 2616 continue stateloop; 2617 } 2618 // CPPONLY: MOZ_FALLTHROUGH; 2619 default: 2620 errBogusComment(); 2621 clearStrBufBeforeUse(); 2622 reconsume = true; 2623 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2624 continue stateloop; 2625 } 2626 } 2627 // CPPONLY: MOZ_FALLTHROUGH; 2628 case MARKUP_DECLARATION_HYPHEN: 2629 markupdeclarationhyphenloop: for (;;) { 2630 if (++pos == endPos) { 2631 break stateloop; 2632 } 2633 c = checkChar(buf, pos); 2634 switch 
(c) { 2635 case '-': 2636 clearStrBufAfterOneHyphen(); 2637 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); 2638 // `break` optimizes; `continue stateloop;` would be valid 2639 break markupdeclarationhyphenloop; 2640 default: 2641 errBogusComment(); 2642 reconsume = true; 2643 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 2644 continue stateloop; 2645 } 2646 } 2647 // CPPONLY: MOZ_FALLTHROUGH; 2648 case COMMENT_START: 2649 reportedConsecutiveHyphens = false; 2650 commentstartloop: for (;;) { 2651 if (++pos == endPos) { 2652 break stateloop; 2653 } 2654 c = checkChar(buf, pos); 2655 /* 2656 * Comment start state 2657 * 2658 * 2659 * Consume the next input character: 2660 */ 2661 switch (c) { 2662 case '-': 2663 /* 2664 * U+002D HYPHEN-MINUS (-) Switch to the comment 2665 * start dash state. 2666 */ 2667 appendStrBuf(c); 2668 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); 2669 continue stateloop; 2670 case '>': 2671 /* 2672 * U+003E GREATER-THAN SIGN (>) Parse error. 2673 */ 2674 errPrematureEndOfComment(); 2675 /* Emit the comment token. */ 2676 emitComment(0, pos); 2677 /* 2678 * Switch to the data state. 2679 */ 2680 state = transition(state, Tokenizer.DATA, reconsume, pos); 2681 if (shouldSuspend) { 2682 break stateloop; 2683 } 2684 continue stateloop; 2685 case '<': 2686 appendStrBuf(c); 2687 state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); 2688 continue stateloop; 2689 case '\r': 2690 appendStrBufCarriageReturn(); 2691 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2692 break stateloop; 2693 case '\n': 2694 appendStrBufLineFeed(); 2695 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2696 break commentstartloop; 2697 case '\u0000': 2698 c = '\uFFFD'; 2699 // CPPONLY: MOZ_FALLTHROUGH; 2700 default: 2701 /* 2702 * Anything else Append the input character to 2703 * the comment token's data. 
2704 */ 2705 appendStrBuf(c); 2706 /* 2707 * Switch to the comment state. 2708 */ 2709 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2710 // `break` optimizes; `continue stateloop;` would be valid 2711 break commentstartloop; 2712 } 2713 } 2714 // CPPONLY: MOZ_FALLTHROUGH; 2715 case COMMENT: 2716 commentloop: for (;;) { 2717 ++pos; 2718 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 2719 // The line below advances pos by some number of code units that this state is indifferent to. 2720 // CPPONLY: pos += accelerateAdvancementComment(buf, pos, endPos); 2721 if (pos == endPos) { 2722 break stateloop; 2723 } 2724 c = checkChar(buf, pos); 2725 /* 2726 * Comment state Consume the next input character: 2727 */ 2728 switch (c) { 2729 case '-': 2730 /* 2731 * U+002D HYPHEN-MINUS (-) Switch to the comment 2732 * end dash state 2733 */ 2734 appendStrBuf(c); 2735 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2736 // `break` optimizes; `continue stateloop;` would be valid 2737 break commentloop; 2738 case '<': 2739 appendStrBuf(c); 2740 state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); 2741 continue stateloop; 2742 case '\r': 2743 appendStrBufCarriageReturn(); 2744 break stateloop; 2745 case '\n': 2746 appendStrBufLineFeed(); 2747 continue; 2748 case '\u0000': 2749 c = '\uFFFD'; 2750 // CPPONLY: MOZ_FALLTHROUGH; 2751 default: 2752 /* 2753 * Anything else Append the input character to 2754 * the comment token's data. 2755 */ 2756 appendStrBuf(c); 2757 /* 2758 * Stay in the comment state. 
2759 */ 2760 continue; 2761 } 2762 } 2763 // CPPONLY: MOZ_FALLTHROUGH; 2764 case COMMENT_END_DASH: 2765 commentenddashloop: for (;;) { 2766 if (++pos == endPos) { 2767 break stateloop; 2768 } 2769 c = checkChar(buf, pos); 2770 /* 2771 * Comment end dash state Consume the next input 2772 * character: 2773 */ 2774 switch (c) { 2775 case '-': 2776 /* 2777 * U+002D HYPHEN-MINUS (-) Switch to the comment 2778 * end state 2779 */ 2780 appendStrBuf(c); 2781 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 2782 // `break` optimizes; `continue stateloop;` would be valid 2783 break commentenddashloop; 2784 case '<': 2785 appendStrBuf(c); 2786 state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); 2787 continue stateloop; 2788 case '\r': 2789 appendStrBufCarriageReturn(); 2790 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2791 break stateloop; 2792 case '\n': 2793 appendStrBufLineFeed(); 2794 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2795 continue stateloop; 2796 case '\u0000': 2797 c = '\uFFFD'; 2798 // CPPONLY: MOZ_FALLTHROUGH; 2799 default: 2800 /* 2801 * Anything else Append a U+002D HYPHEN-MINUS 2802 * (-) character and the input character to the 2803 * comment token's data. 2804 */ 2805 appendStrBuf(c); 2806 /* 2807 * Switch to the comment state. 2808 */ 2809 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2810 continue stateloop; 2811 } 2812 } 2813 // CPPONLY: MOZ_FALLTHROUGH; 2814 case COMMENT_END: 2815 commentendloop: for (;;) { 2816 if (++pos == endPos) { 2817 break stateloop; 2818 } 2819 c = checkChar(buf, pos); 2820 /* 2821 * Comment end dash state Consume the next input 2822 * character: 2823 */ 2824 switch (c) { 2825 case '>': 2826 /* 2827 * U+003E GREATER-THAN SIGN (>) Emit the comment 2828 * token. 2829 */ 2830 emitComment(2, pos); 2831 /* 2832 * Switch to the data state. 
2833 */ 2834 state = transition(state, Tokenizer.DATA, reconsume, pos); 2835 if (shouldSuspend) { 2836 break stateloop; 2837 } 2838 continue stateloop; 2839 case '-': 2840 /* U+002D HYPHEN-MINUS (-) Parse error. */ 2841 /* 2842 * Append a U+002D HYPHEN-MINUS (-) character to 2843 * the comment token's data. 2844 */ 2845 adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens); 2846 reportedConsecutiveHyphens = true; 2847 /* 2848 * Stay in the comment end state. 2849 */ 2850 continue; 2851 case '<': 2852 appendStrBuf(c); 2853 state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); 2854 continue stateloop; 2855 case '\r': 2856 adjustDoubleHyphenAndAppendToStrBufCarriageReturn(); 2857 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2858 break stateloop; 2859 case '\n': 2860 adjustDoubleHyphenAndAppendToStrBufLineFeed(); 2861 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2862 continue stateloop; 2863 case '!': 2864 appendStrBuf(c); 2865 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); 2866 // `break` optimizes; `continue stateloop;` would be valid 2867 break commentendloop; 2868 case '\u0000': 2869 c = '\uFFFD'; 2870 // CPPONLY: MOZ_FALLTHROUGH; 2871 default: 2872 /* 2873 * Append two U+002D HYPHEN-MINUS (-) characters 2874 * and the input character to the comment 2875 * token's data. 2876 */ 2877 adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens); 2878 reportedConsecutiveHyphens = true; 2879 /* 2880 * Switch to the comment state. 
2881 */ 2882 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2883 continue stateloop; 2884 } 2885 } 2886 // CPPONLY: MOZ_FALLTHROUGH; 2887 case COMMENT_END_BANG: 2888 for (;;) { 2889 if (++pos == endPos) { 2890 break stateloop; 2891 } 2892 c = checkChar(buf, pos); 2893 /* 2894 * Comment end bang state 2895 * 2896 * Consume the next input character: 2897 */ 2898 switch (c) { 2899 case '>': 2900 /* 2901 * U+003E GREATER-THAN SIGN (>) Emit the comment 2902 * token. 2903 */ 2904 emitComment(3, pos); 2905 /* 2906 * Switch to the data state. 2907 */ 2908 state = transition(state, Tokenizer.DATA, reconsume, pos); 2909 if (shouldSuspend) { 2910 break stateloop; 2911 } 2912 continue stateloop; 2913 case '-': 2914 /* 2915 * Append two U+002D HYPHEN-MINUS (-) characters 2916 * and a U+0021 EXCLAMATION MARK (!) character 2917 * to the comment token's data. 2918 */ 2919 appendStrBuf(c); 2920 /* 2921 * Switch to the comment end dash state. 2922 */ 2923 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2924 continue stateloop; 2925 case '\r': 2926 appendStrBufCarriageReturn(); 2927 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2928 break stateloop; 2929 case '\n': 2930 appendStrBufLineFeed(); 2931 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2932 continue stateloop; 2933 case '\u0000': 2934 c = '\uFFFD'; 2935 // CPPONLY: MOZ_FALLTHROUGH; 2936 default: 2937 /* 2938 * Anything else Append two U+002D HYPHEN-MINUS 2939 * (-) characters, a U+0021 EXCLAMATION MARK (!) 2940 * character, and the input character to the 2941 * comment token's data. Switch to the comment 2942 * state. 2943 */ 2944 appendStrBuf(c); 2945 /* 2946 * Switch to the comment state. 
2947 */ 2948 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2949 continue stateloop; 2950 } 2951 } 2952 // no fallthrough, reordering opportunity 2953 case COMMENT_LESSTHAN: 2954 commentlessthanloop: for (;;) { 2955 if (++pos == endPos) { 2956 break stateloop; 2957 } 2958 c = checkChar(buf, pos); 2959 switch (c) { 2960 case '!': 2961 appendStrBuf(c); 2962 state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG, reconsume, pos); 2963 // `break` optimizes; `continue stateloop;` would be valid 2964 break commentlessthanloop; 2965 case '<': 2966 appendStrBuf(c); 2967 continue; 2968 case '-': 2969 appendStrBuf(c); 2970 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 2971 continue stateloop; 2972 case '\r': 2973 appendStrBufCarriageReturn(); 2974 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2975 break stateloop; 2976 case '\n': 2977 appendStrBufLineFeed(); 2978 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2979 continue stateloop; 2980 case '\u0000': 2981 c = '\uFFFD'; 2982 // CPPONLY: MOZ_FALLTHROUGH; 2983 default: 2984 appendStrBuf(c); 2985 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 2986 continue stateloop; 2987 } 2988 } 2989 // CPPONLY: MOZ_FALLTHROUGH; 2990 case COMMENT_LESSTHAN_BANG: 2991 commentlessthanbangloop: for (;;) { 2992 if (++pos == endPos) { 2993 break stateloop; 2994 } 2995 c = checkChar(buf, pos); 2996 switch (c) { 2997 case '-': 2998 appendStrBuf(c); 2999 state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH, reconsume, pos); 3000 // `break` optimizes; `continue stateloop;` would be valid 3001 break commentlessthanbangloop; 3002 case '<': 3003 appendStrBuf(c); 3004 state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); 3005 continue stateloop; 3006 case '\r': 3007 appendStrBufCarriageReturn(); 3008 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 3009 break stateloop; 3010 case '\n': 3011 appendStrBufLineFeed(); 3012 state = 
transition(state, Tokenizer.COMMENT, reconsume, pos); 3013 continue stateloop; 3014 case '\u0000': 3015 c = '\uFFFD'; 3016 // CPPONLY: MOZ_FALLTHROUGH; 3017 default: 3018 appendStrBuf(c); 3019 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 3020 continue stateloop; 3021 } 3022 } 3023 // CPPONLY: MOZ_FALLTHROUGH; 3024 case COMMENT_LESSTHAN_BANG_DASH: 3025 if (++pos == endPos) { 3026 break stateloop; 3027 } 3028 c = checkChar(buf, pos); 3029 switch (c) { 3030 case '-': 3031 appendStrBuf(c); 3032 state = transition(state, 3033 Tokenizer.COMMENT_LESSTHAN_BANG_DASH_DASH, 3034 reconsume, pos); 3035 // `break` optimizes; `continue stateloop;` would be valid 3036 break; 3037 case '<': 3038 appendStrBuf(c); 3039 state = transition(state, 3040 Tokenizer.COMMENT_LESSTHAN, reconsume, pos); 3041 continue stateloop; 3042 case '\r': 3043 appendStrBufCarriageReturn(); 3044 state = transition(state, Tokenizer.COMMENT, 3045 reconsume, pos); 3046 break stateloop; 3047 case '\n': 3048 appendStrBufLineFeed(); 3049 state = transition(state, Tokenizer.COMMENT, 3050 reconsume, pos); 3051 continue stateloop; 3052 case '\u0000': 3053 c = '\uFFFD'; 3054 // CPPONLY: MOZ_FALLTHROUGH; 3055 default: 3056 appendStrBuf(c); 3057 state = transition(state, Tokenizer.COMMENT, 3058 reconsume, pos); 3059 continue stateloop; 3060 } 3061 // CPPONLY: MOZ_FALLTHROUGH; 3062 case COMMENT_LESSTHAN_BANG_DASH_DASH: 3063 if (++pos == endPos) { 3064 break stateloop; 3065 } 3066 c = checkChar(buf, pos); 3067 switch (c) { 3068 case '>': 3069 appendStrBuf(c); 3070 emitComment(3, pos); 3071 state = transition(state, Tokenizer.DATA, reconsume, 3072 pos); 3073 if (shouldSuspend) { 3074 break stateloop; 3075 } 3076 continue stateloop; 3077 case '-': 3078 errNestedComment(); 3079 adjustDoubleHyphenAndAppendToStrBufAndErr(c, 3080 reportedConsecutiveHyphens); 3081 reportedConsecutiveHyphens = true; 3082 state = transition(state, Tokenizer.COMMENT_END, 3083 reconsume, pos); 3084 continue stateloop; 3085 case 
'\r': 3086 c = '\n'; 3087 silentCarriageReturn(); 3088 errNestedComment(); 3089 adjustDoubleHyphenAndAppendToStrBufAndErr(c, 3090 reportedConsecutiveHyphens); 3091 reportedConsecutiveHyphens = true; 3092 state = transition(state, Tokenizer.COMMENT, 3093 reconsume, pos); 3094 break stateloop; 3095 case '\n': 3096 silentLineFeed(); 3097 errNestedComment(); 3098 adjustDoubleHyphenAndAppendToStrBufAndErr(c, 3099 reportedConsecutiveHyphens); 3100 reportedConsecutiveHyphens = true; 3101 state = transition(state, Tokenizer.COMMENT, 3102 reconsume, pos); 3103 continue stateloop; 3104 case '!': 3105 errNestedComment(); 3106 adjustDoubleHyphenAndAppendToStrBufAndErr(c, 3107 reportedConsecutiveHyphens); 3108 reportedConsecutiveHyphens = true; 3109 state = transition(state, 3110 Tokenizer.COMMENT_END_BANG, reconsume, pos); 3111 continue stateloop; 3112 case '\u0000': 3113 c = '\uFFFD'; 3114 // CPPONLY: MOZ_FALLTHROUGH; 3115 default: 3116 errNestedComment(); 3117 adjustDoubleHyphenAndAppendToStrBufAndErr(c, 3118 reportedConsecutiveHyphens); 3119 reportedConsecutiveHyphens = true; 3120 state = transition(state, Tokenizer.COMMENT, 3121 reconsume, pos); 3122 continue stateloop; 3123 } 3124 // no fallthrough, reordering opportunity 3125 case COMMENT_START_DASH: 3126 if (++pos == endPos) { 3127 break stateloop; 3128 } 3129 c = checkChar(buf, pos); 3130 /* 3131 * Comment start dash state 3132 * 3133 * Consume the next input character: 3134 */ 3135 switch (c) { 3136 case '-': 3137 /* 3138 * U+002D HYPHEN-MINUS (-) Switch to the comment end 3139 * state 3140 */ 3141 appendStrBuf(c); 3142 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 3143 continue stateloop; 3144 case '>': 3145 errPrematureEndOfComment(); 3146 /* Emit the comment token. */ 3147 emitComment(1, pos); 3148 /* 3149 * Switch to the data state. 
3150 */ 3151 state = transition(state, Tokenizer.DATA, reconsume, pos); 3152 if (shouldSuspend) { 3153 break stateloop; 3154 } 3155 continue stateloop; 3156 case '<': 3157 appendStrBuf(c); 3158 state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos); 3159 continue stateloop; 3160 case '\r': 3161 appendStrBufCarriageReturn(); 3162 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 3163 break stateloop; 3164 case '\n': 3165 appendStrBufLineFeed(); 3166 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 3167 continue stateloop; 3168 case '\u0000': 3169 c = '\uFFFD'; 3170 // CPPONLY: MOZ_FALLTHROUGH; 3171 default: 3172 /* 3173 * Append a U+002D HYPHEN-MINUS character (-) and 3174 * the current input character to the comment 3175 * token's data. 3176 */ 3177 appendStrBuf(c); 3178 /* 3179 * Switch to the comment state. 3180 */ 3181 state = transition(state, Tokenizer.COMMENT, reconsume, pos); 3182 continue stateloop; 3183 } 3184 // no fallthrough, reordering opportunity 3185 case CDATA_START: 3186 for (;;) { 3187 if (++pos == endPos) { 3188 break stateloop; 3189 } 3190 c = checkChar(buf, pos); 3191 if (index < 6) { // CDATA_LSQB.length 3192 if (c == Tokenizer.CDATA_LSQB[index]) { 3193 appendStrBuf(c); 3194 } else { 3195 errBogusComment(); 3196 reconsume = true; 3197 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3198 continue stateloop; 3199 } 3200 index++; 3201 continue; 3202 } else { 3203 clearStrBufAfterUse(); 3204 cstart = pos; // start coalescing 3205 reconsume = true; 3206 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 3207 // `break` optimizes; `continue stateloop;` would be valid 3208 break; 3209 } 3210 } 3211 // CPPONLY: MOZ_FALLTHROUGH; 3212 case CDATA_SECTION: 3213 cdatasectionloop: for (;;) { 3214 if (reconsume) { 3215 reconsume = false; 3216 } else { 3217 ++pos; 3218 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 
3219 // The line below advances pos by some number of code units that this state is indifferent to. 3220 // CPPONLY: pos += accelerateAdvancementCdataSection(buf, pos, endPos); 3221 if (pos == endPos) { 3222 break stateloop; 3223 } 3224 c = checkChar(buf, pos); 3225 } 3226 switch (c) { 3227 case ']': 3228 flushChars(buf, pos); 3229 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); 3230 // `break` optimizes; `continue stateloop;` would be valid 3231 break cdatasectionloop; 3232 case '\u0000': 3233 maybeEmitReplacementCharacter(buf, pos); 3234 continue; 3235 case '\r': 3236 emitCarriageReturn(buf, pos); 3237 break stateloop; 3238 case '\n': 3239 silentLineFeed(); 3240 // CPPONLY: MOZ_FALLTHROUGH; 3241 default: 3242 continue; 3243 } 3244 } 3245 // CPPONLY: MOZ_FALLTHROUGH; 3246 case CDATA_RSQB: 3247 if (++pos == endPos) { 3248 break stateloop; 3249 } 3250 c = checkChar(buf, pos); 3251 switch (c) { 3252 case ']': 3253 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, 3254 reconsume, pos); 3255 // `break` optimizes; `continue stateloop;` would be valid 3256 break; 3257 default: 3258 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 3259 cstart = pos; 3260 reconsume = true; 3261 state = transition(state, Tokenizer.CDATA_SECTION, 3262 reconsume, pos); 3263 continue stateloop; 3264 } 3265 // CPPONLY: MOZ_FALLTHROUGH; 3266 case CDATA_RSQB_RSQB: 3267 cdatarsqbrsqb: for (;;) { 3268 if (++pos == endPos) { 3269 break stateloop; 3270 } 3271 c = checkChar(buf, pos); 3272 switch (c) { 3273 case ']': 3274 // Saw a third ]. Emit one ] (logically the 3275 // first one) and stay in this state to 3276 // remember that the last two characters seen 3277 // have been ]]. 
3278 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 3279 continue; 3280 case '>': 3281 cstart = pos + 1; 3282 state = transition(state, Tokenizer.DATA, reconsume, pos); 3283 // Since a CDATA section starts with a less-than sign, it 3284 // participates in the suspension-after-current-token 3285 // behavior. (The suspension can be requested when the 3286 // less-than sign has been seen but we don't yet know the 3287 // resulting token type.) Therefore, we need to deal with 3288 // a potential request here. 3289 suspendIfRequestedAfterCurrentNonTextToken(); 3290 if (shouldSuspend) { 3291 break stateloop; 3292 } 3293 continue stateloop; 3294 default: 3295 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 3296 cstart = pos; 3297 reconsume = true; 3298 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 3299 continue stateloop; 3300 } 3301 } 3302 // no fallthrough, reordering opportunity 3303 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 3304 attributevaluesinglequotedloop: for (;;) { 3305 if (reconsume) { 3306 reconsume = false; 3307 } else { 3308 ++pos; 3309 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 3310 // The line below advances pos by some number of code units that this state is indifferent to. 3311 // CPPONLY: pos += accelerateAdvancementAttributeValueSingleQuoted(buf, pos, endPos); 3312 if (pos == endPos) { 3313 break stateloop; 3314 } 3315 c = checkChar(buf, pos); 3316 } 3317 /* 3318 * Consume the next input character: 3319 */ 3320 switch (c) { 3321 case '\'': 3322 /* 3323 * U+0027 APOSTROPHE (') Switch to the after 3324 * attribute value (quoted) state. 3325 */ 3326 addAttributeWithValue(); 3327 3328 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 3329 continue stateloop; 3330 case '&': 3331 /* 3332 * U+0026 AMPERSAND (&) Switch to the character 3333 * reference in attribute value state, with the 3334 * + additional allowed character being U+0027 3335 * APOSTROPHE ('). 
3336 */ 3337 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 3338 appendCharRefBuf(c); 3339 setAdditionalAndRememberAmpersandLocation('\''); 3340 returnState = state; 3341 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 3342 // `break` optimizes; `continue stateloop;` would be valid 3343 break attributevaluesinglequotedloop; 3344 case '\r': 3345 appendStrBufCarriageReturn(); 3346 break stateloop; 3347 case '\n': 3348 appendStrBufLineFeed(); 3349 continue; 3350 case '\u0000': 3351 c = '\uFFFD'; 3352 // CPPONLY: MOZ_FALLTHROUGH; 3353 default: 3354 /* 3355 * Anything else Append the current input 3356 * character to the current attribute's value. 3357 */ 3358 appendStrBuf(c); 3359 /* 3360 * Stay in the attribute value (double-quoted) 3361 * state. 3362 */ 3363 continue; 3364 } 3365 } 3366 // CPPONLY: MOZ_FALLTHROUGH; 3367 case CONSUME_CHARACTER_REFERENCE: 3368 if (++pos == endPos) { 3369 break stateloop; 3370 } 3371 c = checkChar(buf, pos); 3372 /* 3373 * Unlike the definition is the spec, this state does not 3374 * return a value and never requires the caller to 3375 * backtrack. This state takes care of emitting characters 3376 * or appending to the current attribute value. It also 3377 * takes care of that in the case when consuming the 3378 * character reference fails. 3379 */ 3380 /* 3381 * This section defines how to consume a character 3382 * reference. This definition is used when parsing character 3383 * references in text and in attributes. 3384 * 3385 * The behavior depends on the identity of the next 3386 * character (the one immediately after the U+0026 AMPERSAND 3387 * character): 3388 */ 3389 switch (c) { 3390 case ' ': 3391 case '\t': 3392 case '\n': 3393 case '\r': // we'll reconsume! 
3394 case '\u000C': 3395 case '<': 3396 case '&': 3397 case '\u0000': 3398 case ';': 3399 emitOrAppendCharRefBuf(returnState); 3400 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3401 cstart = pos; 3402 } 3403 reconsume = true; 3404 state = transition(state, returnState, reconsume, pos); 3405 continue stateloop; 3406 case '#': 3407 /* 3408 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER 3409 * SIGN. 3410 */ 3411 appendCharRefBuf('#'); 3412 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); 3413 continue stateloop; 3414 default: 3415 if (c == additional) { 3416 emitOrAppendCharRefBuf(returnState); 3417 reconsume = true; 3418 state = transition(state, returnState, reconsume, pos); 3419 continue stateloop; 3420 } 3421 if (c >= 'a' && c <= 'z') { 3422 firstCharKey = c - 'a' + 26; 3423 } else if (c >= 'A' && c <= 'Z') { 3424 firstCharKey = c - 'A'; 3425 } else { 3426 // No match 3427 if (c == ';') { 3428 errNoNamedCharacterMatch(); 3429 } 3430 emitOrAppendCharRefBuf(returnState); 3431 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3432 cstart = pos; 3433 } 3434 reconsume = true; 3435 state = transition(state, returnState, reconsume, pos); 3436 continue stateloop; 3437 } 3438 // Didn't fail yet 3439 appendCharRefBuf(c); 3440 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); 3441 // `break` optimizes; `continue stateloop;` would be valid 3442 break; 3443 } 3444 // CPPONLY: MOZ_FALLTHROUGH; 3445 case CHARACTER_REFERENCE_HILO_LOOKUP: 3446 { 3447 if (++pos == endPos) { 3448 break stateloop; 3449 } 3450 c = checkChar(buf, pos); 3451 /* 3452 * The data structure is as follows: 3453 * 3454 * HILO_ACCEL is a two-dimensional int array whose major 3455 * index corresponds to the second character of the 3456 * character reference (code point as index) and the 3457 * minor index corresponds to the first character of the 3458 * character reference (packed so that A-Z runs from 0 3459 * to 25 and a-z runs from 26 to 51). 
This layout makes 3460 * it easier to use the sparseness of the data structure 3461 * to omit parts of it: The second dimension of the 3462 * table is null when no character reference starts with 3463 * the character corresponding to that row. 3464 * 3465 * The int value HILO_ACCEL (by these indices) is zero 3466 * if there exists no character reference starting with 3467 * that two-letter prefix. Otherwise, the value is an 3468 * int that packs two shorts so that the higher short is 3469 * the index of the highest character reference name 3470 * with that prefix in NAMES and the lower short 3471 * corresponds to the index of the lowest character 3472 * reference name with that prefix. (It happens that the 3473 * first two character reference names share their 3474 * prefix so the packed int cannot be 0 by packing the 3475 * two shorts.) 3476 * 3477 * NAMES is an array of byte arrays where each byte 3478 * array encodes the name of a character reference as 3479 * ASCII. The names omit the first two letters of the 3480 * name. (Since storing the first two letters would be 3481 * redundant with the data contained in HILO_ACCEL.) The 3482 * entries are lexically sorted. 3483 * 3484 * For a given index in NAMES, the same index in VALUES 3485 * contains the corresponding expansion as an array of 3486 * two UTF-16 code units (either the character and 3487 * U+0000 or a surrogate pair). 
3488 */ 3489 int hilo = 0; 3490 if (c <= 'z') { 3491 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; 3492 if (row != null) { 3493 hilo = row[firstCharKey]; 3494 } 3495 } 3496 if (hilo == 0) { 3497 if (c == ';') { 3498 errNoNamedCharacterMatch(); 3499 } 3500 emitOrAppendCharRefBuf(returnState); 3501 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3502 cstart = pos; 3503 } 3504 reconsume = true; 3505 state = transition(state, returnState, reconsume, pos); 3506 continue stateloop; 3507 } 3508 // Didn't fail yet 3509 appendCharRefBuf(c); 3510 lo = hilo & 0xFFFF; 3511 hi = hilo >> 16; 3512 entCol = -1; 3513 candidate = -1; 3514 charRefBufMark = 0; 3515 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); 3516 // fallthrough optimizes; `continue stateloop;` would also be valid 3517 } 3518 // CPPONLY: MOZ_FALLTHROUGH; 3519 case CHARACTER_REFERENCE_TAIL: 3520 outer: for (;;) { 3521 if (++pos == endPos) { 3522 break stateloop; 3523 } 3524 c = checkChar(buf, pos); 3525 entCol++; 3526 /* 3527 * Consume the maximum number of characters possible, 3528 * with the consumed characters matching one of the 3529 * identifiers in the first column of the named 3530 * character references table (in a case-sensitive 3531 * manner). 
3532 */ 3533 loloop: for (;;) { 3534 if (hi < lo) { 3535 break outer; 3536 } 3537 if (entCol == NamedCharacters.NAMES[lo].length()) { 3538 candidate = lo; 3539 charRefBufMark = charRefBufLen; 3540 lo++; 3541 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 3542 break outer; 3543 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 3544 lo++; 3545 } else { 3546 break loloop; 3547 } 3548 } 3549 3550 hiloop: for (;;) { 3551 if (hi < lo) { 3552 break outer; 3553 } 3554 if (entCol == NamedCharacters.NAMES[hi].length()) { 3555 break hiloop; 3556 } 3557 if (entCol > NamedCharacters.NAMES[hi].length()) { 3558 break outer; 3559 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 3560 hi--; 3561 } else { 3562 break hiloop; 3563 } 3564 } 3565 3566 if (c == ';') { 3567 // If we see a semicolon, there cannot be a 3568 // longer match. Break the loop. However, before 3569 // breaking, take the longest match so far as the 3570 // candidate, if we are just about to complete a 3571 // match. 3572 if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { 3573 candidate = lo; 3574 charRefBufMark = charRefBufLen; 3575 } 3576 break outer; 3577 } 3578 3579 if (hi < lo) { 3580 break outer; 3581 } 3582 appendCharRefBuf(c); 3583 continue; 3584 } 3585 3586 if (candidate == -1) { 3587 // reconsume deals with CR, LF or nul 3588 if (c == ';') { 3589 errNoNamedCharacterMatch(); 3590 } 3591 emitOrAppendCharRefBuf(returnState); 3592 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3593 cstart = pos; 3594 } 3595 reconsume = true; 3596 state = transition(state, returnState, reconsume, pos); 3597 continue stateloop; 3598 } else { 3599 // c can't be CR, LF or nul if we got here 3600 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 3601 if (candidateName.length() == 0 3602 || candidateName.charAt(candidateName.length() - 1) != ';') { 3603 /* 3604 * If the last character matched is not a U+003B 3605 * SEMICOLON (;), there is a parse error. 
3606 */ 3607 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3608 /* 3609 * If the entity is being consumed as part of an 3610 * attribute, and the last character matched is 3611 * not a U+003B SEMICOLON (;), 3612 */ 3613 char ch; 3614 if (charRefBufMark == charRefBufLen) { 3615 ch = c; 3616 } else { 3617 ch = charRefBuf[charRefBufMark]; 3618 } 3619 if (ch == '=' || (ch >= '0' && ch <= '9') 3620 || (ch >= 'A' && ch <= 'Z') 3621 || (ch >= 'a' && ch <= 'z')) { 3622 /* 3623 * and the next character is either a U+003D 3624 * EQUALS SIGN character (=) or in the range 3625 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 3626 * U+0041 LATIN CAPITAL LETTER A to U+005A 3627 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 3628 * SMALL LETTER A to U+007A LATIN SMALL 3629 * LETTER Z, then, for historical reasons, 3630 * all the characters that were matched 3631 * after the U+0026 AMPERSAND (&) must be 3632 * unconsumed, and nothing is returned. 3633 */ 3634 if (c == ';') { 3635 errNoNamedCharacterMatch(); 3636 } 3637 appendCharRefBufToStrBuf(); 3638 reconsume = true; 3639 state = transition(state, returnState, reconsume, pos); 3640 continue stateloop; 3641 } 3642 } 3643 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3644 errUnescapedAmpersandInterpretedAsCharacterReference(); 3645 } else { 3646 errNotSemicolonTerminated(); 3647 } 3648 } 3649 3650 /* 3651 * Otherwise, return a character token for the character 3652 * corresponding to the entity name (as given by the 3653 * second column of the named character references 3654 * table). 3655 */ 3656 // CPPONLY: completedNamedCharacterReference(); 3657 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 3658 if ( 3659 // [NOCPP[ 3660 val.length == 1 3661 // ]NOCPP] 3662 // CPPONLY: val[1] == 0 3663 ) { 3664 emitOrAppendOne(val, returnState); 3665 } else { 3666 emitOrAppendTwo(val, returnState); 3667 } 3668 // this is so complicated! 
3669 if (charRefBufMark < charRefBufLen) { 3670 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 3671 appendStrBuf(charRefBuf, charRefBufMark, 3672 charRefBufLen - charRefBufMark); 3673 } else { 3674 tokenHandler.characters(charRefBuf, charRefBufMark, 3675 charRefBufLen - charRefBufMark); 3676 } 3677 } 3678 // charRefBufLen will be zeroed below! 3679 3680 // Check if we broke out early with c being the last 3681 // character that matched as opposed to being the 3682 // first one that didn't match. In the case of an 3683 // early break, the next run on text should start 3684 // *after* the current character and the current 3685 // character shouldn't be reconsumed. 3686 boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); 3687 charRefBufLen = 0; 3688 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3689 cstart = earlyBreak ? pos + 1 : pos; 3690 } 3691 reconsume = !earlyBreak; 3692 state = transition(state, returnState, reconsume, pos); 3693 continue stateloop; 3694 /* 3695 * If the markup contains I'm ¬it; I tell you, the 3696 * entity is parsed as "not", as in, I'm ¬it; I tell 3697 * you. But if the markup was I'm ∉ I tell you, 3698 * the entity would be parsed as "notin;", resulting in 3699 * I'm ∉ I tell you. 3700 */ 3701 } 3702 // no fallthrough, reordering opportunity 3703 case CONSUME_NCR: 3704 if (++pos == endPos) { 3705 break stateloop; 3706 } 3707 c = checkChar(buf, pos); 3708 value = 0; 3709 seenDigits = false; 3710 /* 3711 * The behavior further depends on the character after the 3712 * U+0023 NUMBER SIGN: 3713 */ 3714 switch (c) { 3715 case 'x': 3716 case 'X': 3717 3718 /* 3719 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL 3720 * LETTER X Consume the X. 
3721 * 3722 * Follow the steps below, but using the range of 3723 * characters U+0030 DIGIT ZERO through to U+0039 3724 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through 3725 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN 3726 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL 3727 * LETTER F (in other words, 0-9, A-F, a-f). 3728 * 3729 * When it comes to interpreting the number, 3730 * interpret it as a hexadecimal number. 3731 */ 3732 appendCharRefBuf(c); 3733 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); 3734 continue stateloop; 3735 default: 3736 /* 3737 * Anything else Follow the steps below, but using 3738 * the range of characters U+0030 DIGIT ZERO through 3739 * to U+0039 DIGIT NINE (i.e. just 0-9). 3740 * 3741 * When it comes to interpreting the number, 3742 * interpret it as a decimal number. 3743 */ 3744 reconsume = true; 3745 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); 3746 // `break` optimizes; `continue stateloop;` would be valid 3747 break; 3748 } 3749 // CPPONLY: MOZ_FALLTHROUGH; 3750 case DECIMAL_NRC_LOOP: 3751 decimalloop: for (;;) { 3752 if (reconsume) { 3753 reconsume = false; 3754 } else { 3755 if (++pos == endPos) { 3756 break stateloop; 3757 } 3758 c = checkChar(buf, pos); 3759 } 3760 /* 3761 * Consume as many characters as match the range of 3762 * characters given above. 
3763 */ 3764 assert value >= 0: "value must not become negative."; 3765 if (c >= '0' && c <= '9') { 3766 seenDigits = true; 3767 // Avoid overflow 3768 if (value <= 0x10FFFF) { 3769 value *= 10; 3770 value += c - '0'; 3771 } 3772 continue; 3773 } else if (c == ';') { 3774 if (seenDigits) { 3775 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3776 cstart = pos + 1; 3777 } 3778 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3779 // `break` optimizes; `continue stateloop;` would be valid 3780 break decimalloop; 3781 } else { 3782 errNoDigitsInNCR(); 3783 appendCharRefBuf(';'); 3784 emitOrAppendCharRefBuf(returnState); 3785 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3786 cstart = pos + 1; 3787 } 3788 state = transition(state, returnState, reconsume, pos); 3789 continue stateloop; 3790 } 3791 } else { 3792 /* 3793 * If no characters match the range, then don't 3794 * consume any characters (and unconsume the U+0023 3795 * NUMBER SIGN character and, if appropriate, the X 3796 * character). This is a parse error; nothing is 3797 * returned. 3798 * 3799 * Otherwise, if the next character is a U+003B 3800 * SEMICOLON, consume that too. If it isn't, there 3801 * is a parse error. 
3802 */ 3803 if (!seenDigits) { 3804 errNoDigitsInNCR(); 3805 emitOrAppendCharRefBuf(returnState); 3806 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3807 cstart = pos; 3808 } 3809 reconsume = true; 3810 state = transition(state, returnState, reconsume, pos); 3811 continue stateloop; 3812 } else { 3813 errCharRefLacksSemicolon(); 3814 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3815 cstart = pos; 3816 } 3817 reconsume = true; 3818 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3819 // `break` optimizes; `continue stateloop;` would be valid 3820 break decimalloop; 3821 } 3822 } 3823 } 3824 // CPPONLY: MOZ_FALLTHROUGH; 3825 case HANDLE_NCR_VALUE: 3826 // WARNING previous state sets reconsume 3827 // We are not going to emit the contents of charRefBuf. 3828 charRefBufLen = 0; 3829 // XXX inline this case if the method size can take it 3830 handleNcrValue(returnState); 3831 state = transition(state, returnState, reconsume, pos); 3832 continue stateloop; 3833 // no fallthrough, reordering opportunity 3834 case HEX_NCR_LOOP: 3835 for (;;) { 3836 if (++pos == endPos) { 3837 break stateloop; 3838 } 3839 c = checkChar(buf, pos); 3840 /* 3841 * Consume as many characters as match the range of 3842 * characters given above. 
3843 */ 3844 assert value >= 0: "value must not become negative."; 3845 if (c >= '0' && c <= '9') { 3846 seenDigits = true; 3847 // Avoid overflow 3848 if (value <= 0x10FFFF) { 3849 value *= 16; 3850 value += c - '0'; 3851 } 3852 continue; 3853 } else if (c >= 'A' && c <= 'F') { 3854 seenDigits = true; 3855 // Avoid overflow 3856 if (value <= 0x10FFFF) { 3857 value *= 16; 3858 value += c - 'A' + 10; 3859 } 3860 continue; 3861 } else if (c >= 'a' && c <= 'f') { 3862 seenDigits = true; 3863 // Avoid overflow 3864 if (value <= 0x10FFFF) { 3865 value *= 16; 3866 value += c - 'a' + 10; 3867 } 3868 continue; 3869 } else if (c == ';') { 3870 if (seenDigits) { 3871 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3872 cstart = pos + 1; 3873 } 3874 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3875 continue stateloop; 3876 } else { 3877 errNoDigitsInNCR(); 3878 appendCharRefBuf(';'); 3879 emitOrAppendCharRefBuf(returnState); 3880 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3881 cstart = pos + 1; 3882 } 3883 state = transition(state, returnState, reconsume, pos); 3884 continue stateloop; 3885 } 3886 } else { 3887 /* 3888 * If no characters match the range, then don't 3889 * consume any characters (and unconsume the U+0023 3890 * NUMBER SIGN character and, if appropriate, the X 3891 * character). This is a parse error; nothing is 3892 * returned. 3893 * 3894 * Otherwise, if the next character is a U+003B 3895 * SEMICOLON, consume that too. If it isn't, there 3896 * is a parse error. 
3897 */ 3898 if (!seenDigits) { 3899 errNoDigitsInNCR(); 3900 emitOrAppendCharRefBuf(returnState); 3901 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3902 cstart = pos; 3903 } 3904 reconsume = true; 3905 state = transition(state, returnState, reconsume, pos); 3906 continue stateloop; 3907 } else { 3908 errCharRefLacksSemicolon(); 3909 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 3910 cstart = pos; 3911 } 3912 reconsume = true; 3913 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 3914 continue stateloop; 3915 } 3916 } 3917 } 3918 // no fallthrough, reordering opportunity 3919 case PLAINTEXT: 3920 plaintextloop: for (;;) { 3921 if (reconsume) { 3922 reconsume = false; 3923 } else { 3924 ++pos; 3925 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 3926 // The line below advances pos by some number of code units that this state is indifferent to. 3927 // CPPONLY: pos += accelerateAdvancementPlaintext(buf, pos, endPos); 3928 if (pos == endPos) { 3929 break stateloop; 3930 } 3931 c = checkChar(buf, pos); 3932 } 3933 switch (c) { 3934 case '\u0000': 3935 emitPlaintextReplacementCharacter(buf, pos); 3936 continue; 3937 case '\r': 3938 emitCarriageReturn(buf, pos); 3939 break stateloop; 3940 case '\n': 3941 silentLineFeed(); 3942 // CPPONLY: MOZ_FALLTHROUGH; 3943 default: 3944 /* 3945 * Anything else Emit the current input 3946 * character as a character token. Stay in the 3947 * RAWTEXT state. 3948 */ 3949 continue; 3950 } 3951 } 3952 // no fallthrough, reordering opportunity 3953 case CLOSE_TAG_OPEN: 3954 if (++pos == endPos) { 3955 break stateloop; 3956 } 3957 c = checkChar(buf, pos); 3958 /* 3959 * Otherwise, if the content model flag is set to the PCDATA 3960 * state, or if the next few characters do match that tag 3961 * name, consume the next input character: 3962 */ 3963 switch (c) { 3964 case '>': 3965 /* U+003E GREATER-THAN SIGN (>) Parse error. 
*/ 3966 errLtSlashGt(); 3967 /* 3968 * Switch to the data state. 3969 */ 3970 cstart = pos + 1; 3971 state = transition(state, Tokenizer.DATA, reconsume, pos); 3972 continue stateloop; 3973 case '\r': 3974 silentCarriageReturn(); 3975 /* Anything else Parse error. */ 3976 errGarbageAfterLtSlash(); 3977 /* 3978 * Switch to the bogus comment state. 3979 */ 3980 clearStrBufBeforeUse(); 3981 appendStrBuf('\n'); 3982 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3983 break stateloop; 3984 case '\n': 3985 silentLineFeed(); 3986 /* Anything else Parse error. */ 3987 errGarbageAfterLtSlash(); 3988 /* 3989 * Switch to the bogus comment state. 3990 */ 3991 clearStrBufBeforeUse(); 3992 appendStrBuf(c); 3993 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 3994 continue stateloop; 3995 case '\u0000': 3996 c = '\uFFFD'; 3997 // CPPONLY: MOZ_FALLTHROUGH; 3998 default: 3999 if (c >= 'A' && c <= 'Z') { 4000 c += 0x20; 4001 } 4002 if (c >= 'a' && c <= 'z') { 4003 /* 4004 * U+0061 LATIN SMALL LETTER A through to U+007A 4005 * LATIN SMALL LETTER Z Create a new end tag 4006 * token, 4007 */ 4008 endTag = true; 4009 /* 4010 * set its tag name to the input character, 4011 */ 4012 clearStrBufBeforeUse(); 4013 appendStrBuf(c); 4014 containsHyphen = false; 4015 /* 4016 * then switch to the tag name state. (Don't 4017 * emit the token yet; further details will be 4018 * filled in before it is emitted.) 4019 */ 4020 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 4021 continue stateloop; 4022 } else { 4023 /* Anything else Parse error. */ 4024 errGarbageAfterLtSlash(); 4025 /* 4026 * Switch to the bogus comment state. 
4027 */ 4028 clearStrBufBeforeUse(); 4029 appendStrBuf(c); 4030 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4031 continue stateloop; 4032 } 4033 } 4034 // no fallthrough, reordering opportunity 4035 case RCDATA: 4036 rcdataloop: for (;;) { 4037 if (reconsume) { 4038 reconsume = false; 4039 } else { 4040 ++pos; 4041 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 4042 // The line below advances pos by some number of code units that this state is indifferent to. 4043 // RCDATA and DATA have the same set of characters that they are indifferent to, hence accelerateData. 4044 // CPPONLY: pos += accelerateAdvancementData(buf, pos, endPos); 4045 if (pos == endPos) { 4046 break stateloop; 4047 } 4048 c = checkChar(buf, pos); 4049 } 4050 switch (c) { 4051 case '&': 4052 /* 4053 * U+0026 AMPERSAND (&) Switch to the character 4054 * reference in RCDATA state. 4055 */ 4056 flushChars(buf, pos); 4057 assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; 4058 appendCharRefBuf(c); 4059 setAdditionalAndRememberAmpersandLocation('\u0000'); 4060 returnState = state; 4061 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 4062 continue stateloop; 4063 case '<': 4064 /* 4065 * U+003C LESS-THAN SIGN (<) Switch to the 4066 * RCDATA less-than sign state. 4067 */ 4068 flushChars(buf, pos); 4069 4070 returnState = state; 4071 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 4072 continue stateloop; 4073 case '\u0000': 4074 emitReplacementCharacter(buf, pos); 4075 continue; 4076 case '\r': 4077 emitCarriageReturn(buf, pos); 4078 break stateloop; 4079 case '\n': 4080 silentLineFeed(); 4081 // CPPONLY: MOZ_FALLTHROUGH; 4082 default: 4083 /* 4084 * Emit the current input character as a 4085 * character token. Stay in the RCDATA state. 
4086 */ 4087 continue; 4088 } 4089 } 4090 // no fallthrough, reordering opportunity 4091 case RAWTEXT: 4092 rawtextloop: for (;;) { 4093 if (reconsume) { 4094 reconsume = false; 4095 } else { 4096 ++pos; 4097 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 4098 // The line below advances pos by some number of code units that this state is indifferent to. 4099 // CPPONLY: pos += accelerateAdvancementRawtext(buf, pos, endPos); 4100 if (pos == endPos) { 4101 break stateloop; 4102 } 4103 c = checkChar(buf, pos); 4104 } 4105 switch (c) { 4106 case '<': 4107 /* 4108 * U+003C LESS-THAN SIGN (<) Switch to the 4109 * RAWTEXT less-than sign state. 4110 */ 4111 flushChars(buf, pos); 4112 4113 returnState = state; 4114 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 4115 // `break` optimizes; `continue stateloop;` would be valid 4116 break rawtextloop; 4117 case '\u0000': 4118 emitReplacementCharacter(buf, pos); 4119 continue; 4120 case '\r': 4121 emitCarriageReturn(buf, pos); 4122 break stateloop; 4123 case '\n': 4124 silentLineFeed(); 4125 // CPPONLY: MOZ_FALLTHROUGH; 4126 default: 4127 /* 4128 * Emit the current input character as a 4129 * character token. Stay in the RAWTEXT state. 4130 */ 4131 continue; 4132 } 4133 } 4134 // CPPONLY: MOZ_FALLTHROUGH; 4135 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 4136 rawtextrcdatalessthansignloop: for (;;) { 4137 if (++pos == endPos) { 4138 break stateloop; 4139 } 4140 c = checkChar(buf, pos); 4141 switch (c) { 4142 case '/': 4143 /* 4144 * U+002F SOLIDUS (/) Set the temporary buffer 4145 * to the empty string. Switch to the script 4146 * data end tag open state. 
4147 */ 4148 index = 0; 4149 clearStrBufBeforeUse(); 4150 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4151 // `break` optimizes; `continue stateloop;` would be valid 4152 break rawtextrcdatalessthansignloop; 4153 default: 4154 /* 4155 * Otherwise, emit a U+003C LESS-THAN SIGN 4156 * character token 4157 */ 4158 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4159 /* 4160 * and reconsume the current input character in 4161 * the data state. 4162 */ 4163 cstart = pos; 4164 reconsume = true; 4165 state = transition(state, returnState, reconsume, pos); 4166 continue stateloop; 4167 } 4168 } 4169 // CPPONLY: MOZ_FALLTHROUGH; 4170 case NON_DATA_END_TAG_NAME: 4171 for (;;) { 4172 if (++pos == endPos) { 4173 break stateloop; 4174 } 4175 c = checkChar(buf, pos); 4176 /* 4177 * ASSERT! when entering this state, set index to 0 and 4178 * call clearStrBufBeforeUse(); Let's implement the above 4179 * without lookahead. strBuf is the 'temporary buffer'. 4180 */ 4181 if (endTagExpectationAsArray == null) { 4182 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 4183 0, 2); 4184 cstart = pos; 4185 reconsume = true; 4186 state = transition(state, returnState, reconsume, pos); 4187 continue stateloop; 4188 } else if (index < endTagExpectationAsArray.length) { 4189 char e = endTagExpectationAsArray[index]; 4190 char folded = c; 4191 if (c >= 'A' && c <= 'Z') { 4192 folded += 0x20; 4193 } 4194 if (folded != e) { 4195 // [NOCPP[ 4196 errHtml4LtSlashInRcdata(folded); 4197 // ]NOCPP] 4198 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 4199 0, 2); 4200 emitStrBuf(); 4201 cstart = pos; 4202 reconsume = true; 4203 state = transition(state, returnState, reconsume, pos); 4204 continue stateloop; 4205 } 4206 appendStrBuf(c); 4207 index++; 4208 continue; 4209 } else { 4210 endTag = true; 4211 // XXX replace contentModelElement with different 4212 // type 4213 tagName = endTagExpectation; 4214 switch (c) { 4215 case '\r': 4216 silentCarriageReturn(); 4217 
clearStrBufAfterUse(); // strBuf not used 4218 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 4219 break stateloop; 4220 case '\n': 4221 silentLineFeed(); 4222 // CPPONLY: MOZ_FALLTHROUGH; 4223 case ' ': 4224 case '\t': 4225 case '\u000C': 4226 /* 4227 * U+0009 CHARACTER TABULATION U+000A LINE 4228 * FEED (LF) U+000C FORM FEED (FF) U+0020 4229 * SPACE If the current end tag token is an 4230 * appropriate end tag token, then switch to 4231 * the before attribute name state. 4232 */ 4233 clearStrBufAfterUse(); // strBuf not used 4234 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 4235 continue stateloop; 4236 case '/': 4237 /* 4238 * U+002F SOLIDUS (/) If the current end tag 4239 * token is an appropriate end tag token, 4240 * then switch to the self-closing start tag 4241 * state. 4242 */ 4243 clearStrBufAfterUse(); // strBuf not used 4244 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 4245 continue stateloop; 4246 case '>': 4247 /* 4248 * U+003E GREATER-THAN SIGN (>) If the 4249 * current end tag token is an appropriate 4250 * end tag token, then emit the current tag 4251 * token and switch to the data state. 4252 */ 4253 clearStrBufAfterUse(); // strBuf not used 4254 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 4255 if (shouldSuspend) { 4256 break stateloop; 4257 } 4258 continue stateloop; 4259 default: 4260 /* 4261 * Emit a U+003C LESS-THAN SIGN character 4262 * token, a U+002F SOLIDUS character token, 4263 * a character token for each of the 4264 * characters in the temporary buffer (in 4265 * the order they were added to the buffer), 4266 * and reconsume the current input character 4267 * in the RAWTEXT state. 
4268 */ 4269 // [NOCPP[ 4270 errWarnLtSlashInRcdata(); 4271 // ]NOCPP] 4272 tokenHandler.characters( 4273 Tokenizer.LT_SOLIDUS, 0, 2); 4274 emitStrBuf(); 4275 cstart = pos; // don't drop the 4276 // character 4277 reconsume = true; 4278 state = transition(state, returnState, reconsume, pos); 4279 continue stateloop; 4280 } 4281 } 4282 } 4283 // no fallthrough, reordering opportunity 4284 // BEGIN HOTSPOT WORKAROUND 4285 case BOGUS_COMMENT: 4286 boguscommentloop: for (;;) { 4287 if (reconsume) { 4288 reconsume = false; 4289 } else { 4290 if (++pos == endPos) { 4291 break stateloop; 4292 } 4293 c = checkChar(buf, pos); 4294 } 4295 /* 4296 * Consume every character up to and including the first 4297 * U+003E GREATER-THAN SIGN character (>) or the end of 4298 * the file (EOF), whichever comes first. Emit a comment 4299 * token whose data is the concatenation of all the 4300 * characters starting from and including the character 4301 * that caused the state machine to switch into the 4302 * bogus comment state, up to and including the 4303 * character immediately before the last consumed 4304 * character (i.e. up to the character just before the 4305 * U+003E or EOF character). (If the comment was started 4306 * by the end of the file (EOF), the token is empty.) 4307 * 4308 * Switch to the data state. 4309 * 4310 * If the end of the file was reached, reconsume the EOF 4311 * character. 
4312 */ 4313 switch (c) { 4314 case '>': 4315 emitComment(0, pos); 4316 state = transition(state, Tokenizer.DATA, reconsume, pos); 4317 if (shouldSuspend) { 4318 break stateloop; 4319 } 4320 continue stateloop; 4321 case '-': 4322 appendStrBuf(c); 4323 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); 4324 // `break` optimizes; `continue stateloop;` would be valid 4325 break boguscommentloop; 4326 case '\r': 4327 appendStrBufCarriageReturn(); 4328 break stateloop; 4329 case '\n': 4330 appendStrBufLineFeed(); 4331 continue; 4332 case '\u0000': 4333 c = '\uFFFD'; 4334 // CPPONLY: MOZ_FALLTHROUGH; 4335 default: 4336 appendStrBuf(c); 4337 continue; 4338 } 4339 } 4340 // CPPONLY: MOZ_FALLTHROUGH; 4341 case BOGUS_COMMENT_HYPHEN: 4342 boguscommenthyphenloop: for (;;) { 4343 if (++pos == endPos) { 4344 break stateloop; 4345 } 4346 c = checkChar(buf, pos); 4347 switch (c) { 4348 case '>': 4349 // [NOCPP[ 4350 maybeAppendSpaceToBogusComment(); 4351 // ]NOCPP] 4352 emitComment(0, pos); 4353 state = transition(state, Tokenizer.DATA, reconsume, pos); 4354 if (shouldSuspend) { 4355 break stateloop; 4356 } 4357 continue stateloop; 4358 case '-': 4359 appendSecondHyphenToBogusComment(); 4360 continue boguscommenthyphenloop; 4361 case '\r': 4362 appendStrBufCarriageReturn(); 4363 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4364 break stateloop; 4365 case '\n': 4366 appendStrBufLineFeed(); 4367 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4368 continue stateloop; 4369 case '\u0000': 4370 c = '\uFFFD'; 4371 // CPPONLY: MOZ_FALLTHROUGH; 4372 default: 4373 appendStrBuf(c); 4374 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 4375 continue stateloop; 4376 } 4377 } 4378 // no fallthrough, reordering opportunity 4379 case SCRIPT_DATA: 4380 scriptdataloop: for (;;) { 4381 if (reconsume) { 4382 reconsume = false; 4383 } else { 4384 ++pos; 4385 // Perhaps at some point, it will be appropriate to do 
SIMD in Java, but not today. 4386 // The line below advances pos by some number of code units that this state is indifferent to. 4387 // Using `accelerateAdvancementRawtext`, because this states has the same characters of interest as RAWTEXT. 4388 // CPPONLY: pos += accelerateAdvancementRawtext(buf, pos, endPos); 4389 if (pos == endPos) { 4390 break stateloop; 4391 } 4392 c = checkChar(buf, pos); 4393 } 4394 switch (c) { 4395 case '<': 4396 /* 4397 * U+003C LESS-THAN SIGN (<) Switch to the 4398 * script data less-than sign state. 4399 */ 4400 flushChars(buf, pos); 4401 returnState = state; 4402 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); 4403 // `break` optimizes; `continue stateloop;` would be valid 4404 break scriptdataloop; 4405 case '\u0000': 4406 emitReplacementCharacter(buf, pos); 4407 continue; 4408 case '\r': 4409 emitCarriageReturn(buf, pos); 4410 break stateloop; 4411 case '\n': 4412 silentLineFeed(); 4413 // CPPONLY: MOZ_FALLTHROUGH; 4414 default: 4415 /* 4416 * Anything else Emit the current input 4417 * character as a character token. Stay in the 4418 * script data state. 4419 */ 4420 continue; 4421 } 4422 } 4423 // CPPONLY: MOZ_FALLTHROUGH; 4424 case SCRIPT_DATA_LESS_THAN_SIGN: 4425 scriptdatalessthansignloop: for (;;) { 4426 if (++pos == endPos) { 4427 break stateloop; 4428 } 4429 c = checkChar(buf, pos); 4430 switch (c) { 4431 case '/': 4432 /* 4433 * U+002F SOLIDUS (/) Set the temporary buffer 4434 * to the empty string. Switch to the script 4435 * data end tag open state. 
4436 */ 4437 index = 0; 4438 clearStrBufBeforeUse(); 4439 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4440 continue stateloop; 4441 case '!': 4442 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4443 cstart = pos; 4444 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); 4445 // `break` optimizes; `continue stateloop;` would be valid 4446 break scriptdatalessthansignloop; 4447 default: 4448 /* 4449 * Otherwise, emit a U+003C LESS-THAN SIGN 4450 * character token 4451 */ 4452 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4453 /* 4454 * and reconsume the current input character in 4455 * the data state. 4456 */ 4457 cstart = pos; 4458 reconsume = true; 4459 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4460 continue stateloop; 4461 } 4462 } 4463 // CPPONLY: MOZ_FALLTHROUGH; 4464 case SCRIPT_DATA_ESCAPE_START: 4465 scriptdataescapestartloop: for (;;) { 4466 if (++pos == endPos) { 4467 break stateloop; 4468 } 4469 c = checkChar(buf, pos); 4470 /* 4471 * Consume the next input character: 4472 */ 4473 switch (c) { 4474 case '-': 4475 /* 4476 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4477 * HYPHEN-MINUS character token. Switch to the 4478 * script data escape start dash state. 4479 */ 4480 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); 4481 // `break` optimizes; `continue stateloop;` would be valid 4482 break scriptdataescapestartloop; 4483 default: 4484 /* 4485 * Anything else Reconsume the current input 4486 * character in the script data state. 
4487 */ 4488 reconsume = true; 4489 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4490 continue stateloop; 4491 } 4492 } 4493 // CPPONLY: MOZ_FALLTHROUGH; 4494 case SCRIPT_DATA_ESCAPE_START_DASH: 4495 scriptdataescapestartdashloop: for (;;) { 4496 if (++pos == endPos) { 4497 break stateloop; 4498 } 4499 c = checkChar(buf, pos); 4500 /* 4501 * Consume the next input character: 4502 */ 4503 switch (c) { 4504 case '-': 4505 /* 4506 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4507 * HYPHEN-MINUS character token. Switch to the 4508 * script data escaped dash dash state. 4509 */ 4510 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4511 // `break` optimizes; `continue stateloop;` would be valid 4512 break scriptdataescapestartdashloop; 4513 default: 4514 /* 4515 * Anything else Reconsume the current input 4516 * character in the script data state. 4517 */ 4518 reconsume = true; 4519 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4520 continue stateloop; 4521 } 4522 } 4523 // CPPONLY: MOZ_FALLTHROUGH; 4524 case SCRIPT_DATA_ESCAPED_DASH_DASH: 4525 scriptdataescapeddashdashloop: for (;;) { 4526 if (++pos == endPos) { 4527 break stateloop; 4528 } 4529 c = checkChar(buf, pos); 4530 /* 4531 * Consume the next input character: 4532 */ 4533 switch (c) { 4534 case '-': 4535 /* 4536 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4537 * HYPHEN-MINUS character token. Stay in the 4538 * script data escaped dash dash state. 4539 */ 4540 continue; 4541 case '<': 4542 /* 4543 * U+003C LESS-THAN SIGN (<) Switch to the 4544 * script data escaped less-than sign state. 4545 */ 4546 flushChars(buf, pos); 4547 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4548 continue stateloop; 4549 case '>': 4550 /* 4551 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4552 * GREATER-THAN SIGN character token. Switch to 4553 * the script data state. 
4554 */ 4555 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4556 continue stateloop; 4557 case '\u0000': 4558 emitReplacementCharacter(buf, pos); 4559 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4560 break scriptdataescapeddashdashloop; 4561 case '\r': 4562 emitCarriageReturn(buf, pos); 4563 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4564 break stateloop; 4565 case '\n': 4566 silentLineFeed(); 4567 // CPPONLY: MOZ_FALLTHROUGH; 4568 default: 4569 /* 4570 * Anything else Emit the current input 4571 * character as a character token. Switch to the 4572 * script data escaped state. 4573 */ 4574 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4575 // `break` optimizes; `continue stateloop;` would be valid 4576 break scriptdataescapeddashdashloop; 4577 } 4578 } 4579 // CPPONLY: MOZ_FALLTHROUGH; 4580 case SCRIPT_DATA_ESCAPED: 4581 scriptdataescapedloop: for (;;) { 4582 if (reconsume) { 4583 reconsume = false; 4584 } else { 4585 ++pos; 4586 // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. 4587 // The line below advances pos by some number of code units that this state is indifferent to. 4588 // CPPONLY: pos += accelerateAdvancementScriptDataEscaped(buf, pos, endPos); 4589 if (pos == endPos) { 4590 break stateloop; 4591 } 4592 c = checkChar(buf, pos); 4593 } 4594 /* 4595 * Consume the next input character: 4596 */ 4597 switch (c) { 4598 case '-': 4599 /* 4600 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4601 * HYPHEN-MINUS character token. Switch to the 4602 * script data escaped dash state. 4603 */ 4604 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); 4605 // `break` optimizes; `continue stateloop;` would be valid 4606 break scriptdataescapedloop; 4607 case '<': 4608 /* 4609 * U+003C LESS-THAN SIGN (<) Switch to the 4610 * script data escaped less-than sign state. 
4611 */ 4612 flushChars(buf, pos); 4613 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4614 continue stateloop; 4615 case '\u0000': 4616 emitReplacementCharacter(buf, pos); 4617 continue; 4618 case '\r': 4619 emitCarriageReturn(buf, pos); 4620 break stateloop; 4621 case '\n': 4622 silentLineFeed(); 4623 // CPPONLY: MOZ_FALLTHROUGH; 4624 default: 4625 /* 4626 * Anything else Emit the current input 4627 * character as a character token. Stay in the 4628 * script data escaped state. 4629 */ 4630 continue; 4631 } 4632 } 4633 // CPPONLY: MOZ_FALLTHROUGH; 4634 case SCRIPT_DATA_ESCAPED_DASH: 4635 scriptdataescapeddashloop: for (;;) { 4636 if (++pos == endPos) { 4637 break stateloop; 4638 } 4639 c = checkChar(buf, pos); 4640 /* 4641 * Consume the next input character: 4642 */ 4643 switch (c) { 4644 case '-': 4645 /* 4646 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4647 * HYPHEN-MINUS character token. Switch to the 4648 * script data escaped dash dash state. 4649 */ 4650 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 4651 continue stateloop; 4652 case '<': 4653 /* 4654 * U+003C LESS-THAN SIGN (<) Switch to the 4655 * script data escaped less-than sign state. 4656 */ 4657 flushChars(buf, pos); 4658 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4659 // `break` optimizes; `continue stateloop;` would be valid 4660 break scriptdataescapeddashloop; 4661 case '\u0000': 4662 emitReplacementCharacter(buf, pos); 4663 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4664 continue stateloop; 4665 case '\r': 4666 emitCarriageReturn(buf, pos); 4667 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4668 break stateloop; 4669 case '\n': 4670 silentLineFeed(); 4671 // CPPONLY: MOZ_FALLTHROUGH; 4672 default: 4673 /* 4674 * Anything else Emit the current input 4675 * character as a character token. 
Switch to the 4676 * script data escaped state. 4677 */ 4678 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4679 continue stateloop; 4680 } 4681 } 4682 // CPPONLY: MOZ_FALLTHROUGH; 4683 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 4684 scriptdataescapedlessthanloop: for (;;) { 4685 if (++pos == endPos) { 4686 break stateloop; 4687 } 4688 c = checkChar(buf, pos); 4689 /* 4690 * Consume the next input character: 4691 */ 4692 switch (c) { 4693 case '/': 4694 /* 4695 * U+002F SOLIDUS (/) Set the temporary buffer 4696 * to the empty string. Switch to the script 4697 * data escaped end tag open state. 4698 */ 4699 index = 0; 4700 clearStrBufBeforeUse(); 4701 returnState = Tokenizer.SCRIPT_DATA_ESCAPED; 4702 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 4703 continue stateloop; 4704 case 'S': 4705 case 's': 4706 /* 4707 * U+0041 LATIN CAPITAL LETTER A through to 4708 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C 4709 * LESS-THAN SIGN character token and the 4710 * current input character as a character token. 4711 */ 4712 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4713 cstart = pos; 4714 index = 1; 4715 /* 4716 * Set the temporary buffer to the empty string. 4717 * Append the lowercase version of the current 4718 * input character (add 0x0020 to the 4719 * character's code point) to the temporary 4720 * buffer. Switch to the script data double 4721 * escape start state. 4722 */ 4723 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); 4724 // `break` optimizes; `continue stateloop;` would be valid 4725 break scriptdataescapedlessthanloop; 4726 default: 4727 /* 4728 * Anything else Emit a U+003C LESS-THAN SIGN 4729 * character token and reconsume the current 4730 * input character in the script data escaped 4731 * state. 
4732 */ 4733 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 4734 cstart = pos; 4735 reconsume = true; 4736 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4737 continue stateloop; 4738 } 4739 } 4740 // CPPONLY: MOZ_FALLTHROUGH; 4741 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 4742 scriptdatadoubleescapestartloop: for (;;) { 4743 if (++pos == endPos) { 4744 break stateloop; 4745 } 4746 c = checkChar(buf, pos); 4747 assert index > 0; 4748 if (index < 6) { // SCRIPT_ARR.length 4749 char folded = c; 4750 if (c >= 'A' && c <= 'Z') { 4751 folded += 0x20; 4752 } 4753 if (folded != Tokenizer.SCRIPT_ARR[index]) { 4754 reconsume = true; 4755 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4756 continue stateloop; 4757 } 4758 index++; 4759 continue; 4760 } 4761 switch (c) { 4762 case '\r': 4763 emitCarriageReturn(buf, pos); 4764 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4765 break stateloop; 4766 case '\n': 4767 silentLineFeed(); 4768 // CPPONLY: MOZ_FALLTHROUGH; 4769 case ' ': 4770 case '\t': 4771 case '\u000C': 4772 case '/': 4773 case '>': 4774 /* 4775 * U+0009 CHARACTER TABULATION U+000A LINE FEED 4776 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 4777 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 4778 * (>) Emit the current input character as a 4779 * character token. If the temporary buffer is 4780 * the string "script", then switch to the 4781 * script data double escaped state. 4782 */ 4783 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4784 // `break` optimizes; `continue stateloop;` would be valid 4785 break scriptdatadoubleescapestartloop; 4786 default: 4787 /* 4788 * Anything else Reconsume the current input 4789 * character in the script data escaped state. 
4790 */ 4791 reconsume = true; 4792 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 4793 continue stateloop; 4794 } 4795 } 4796 // CPPONLY: MOZ_FALLTHROUGH; 4797 case SCRIPT_DATA_DOUBLE_ESCAPED: 4798 scriptdatadoubleescapedloop: for (;;) { 4799 if (reconsume) { 4800 reconsume = false; 4801 } else { 4802 if (++pos == endPos) { 4803 break stateloop; 4804 } 4805 c = checkChar(buf, pos); 4806 } 4807 /* 4808 * Consume the next input character: 4809 */ 4810 switch (c) { 4811 case '-': 4812 /* 4813 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4814 * HYPHEN-MINUS character token. Switch to the 4815 * script data double escaped dash state. 4816 */ 4817 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); 4818 // `break` optimizes; `continue stateloop;` would be valid 4819 break scriptdatadoubleescapedloop; 4820 case '<': 4821 /* 4822 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4823 * LESS-THAN SIGN character token. Switch to the 4824 * script data double escaped less-than sign 4825 * state. 4826 */ 4827 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4828 continue stateloop; 4829 case '\u0000': 4830 emitReplacementCharacter(buf, pos); 4831 continue; 4832 case '\r': 4833 emitCarriageReturn(buf, pos); 4834 break stateloop; 4835 case '\n': 4836 silentLineFeed(); 4837 // CPPONLY: MOZ_FALLTHROUGH; 4838 default: 4839 /* 4840 * Anything else Emit the current input 4841 * character as a character token. Stay in the 4842 * script data double escaped state. 4843 */ 4844 continue; 4845 } 4846 } 4847 // CPPONLY: MOZ_FALLTHROUGH; 4848 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 4849 scriptdatadoubleescapeddashloop: for (;;) { 4850 if (++pos == endPos) { 4851 break stateloop; 4852 } 4853 c = checkChar(buf, pos); 4854 /* 4855 * Consume the next input character: 4856 */ 4857 switch (c) { 4858 case '-': 4859 /* 4860 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4861 * HYPHEN-MINUS character token. 
Switch to the 4862 * script data double escaped dash dash state. 4863 */ 4864 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); 4865 // `break` optimizes; `continue stateloop;` would be valid 4866 break scriptdatadoubleescapeddashloop; 4867 case '<': 4868 /* 4869 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4870 * LESS-THAN SIGN character token. Switch to the 4871 * script data double escaped less-than sign 4872 * state. 4873 */ 4874 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4875 continue stateloop; 4876 case '\u0000': 4877 emitReplacementCharacter(buf, pos); 4878 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4879 continue stateloop; 4880 case '\r': 4881 emitCarriageReturn(buf, pos); 4882 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4883 break stateloop; 4884 case '\n': 4885 silentLineFeed(); 4886 // CPPONLY: MOZ_FALLTHROUGH; 4887 default: 4888 /* 4889 * Anything else Emit the current input 4890 * character as a character token. Switch to the 4891 * script data double escaped state. 4892 */ 4893 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4894 continue stateloop; 4895 } 4896 } 4897 // CPPONLY: MOZ_FALLTHROUGH; 4898 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 4899 scriptdatadoubleescapeddashdashloop: for (;;) { 4900 if (++pos == endPos) { 4901 break stateloop; 4902 } 4903 c = checkChar(buf, pos); 4904 /* 4905 * Consume the next input character: 4906 */ 4907 switch (c) { 4908 case '-': 4909 /* 4910 * U+002D HYPHEN-MINUS (-) Emit a U+002D 4911 * HYPHEN-MINUS character token. Stay in the 4912 * script data double escaped dash dash state. 4913 */ 4914 continue; 4915 case '<': 4916 /* 4917 * U+003C LESS-THAN SIGN (<) Emit a U+003C 4918 * LESS-THAN SIGN character token. Switch to the 4919 * script data double escaped less-than sign 4920 * state. 
4921 */ 4922 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 4923 // `break` optimizes; `continue stateloop;` would be valid 4924 break scriptdatadoubleescapeddashdashloop; 4925 case '>': 4926 /* 4927 * U+003E GREATER-THAN SIGN (>) Emit a U+003E 4928 * GREATER-THAN SIGN character token. Switch to 4929 * the script data state. 4930 */ 4931 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 4932 continue stateloop; 4933 case '\u0000': 4934 emitReplacementCharacter(buf, pos); 4935 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4936 continue stateloop; 4937 case '\r': 4938 emitCarriageReturn(buf, pos); 4939 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4940 break stateloop; 4941 case '\n': 4942 silentLineFeed(); 4943 // CPPONLY: MOZ_FALLTHROUGH; 4944 default: 4945 /* 4946 * Anything else Emit the current input 4947 * character as a character token. Switch to the 4948 * script data double escaped state. 4949 */ 4950 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4951 continue stateloop; 4952 } 4953 } 4954 // CPPONLY: MOZ_FALLTHROUGH; 4955 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 4956 scriptdatadoubleescapedlessthanloop: for (;;) { 4957 if (++pos == endPos) { 4958 break stateloop; 4959 } 4960 c = checkChar(buf, pos); 4961 /* 4962 * Consume the next input character: 4963 */ 4964 switch (c) { 4965 case '/': 4966 /* 4967 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS 4968 * character token. Set the temporary buffer to 4969 * the empty string. Switch to the script data 4970 * double escape end state. 
4971 */ 4972 index = 0; 4973 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); 4974 // `break` optimizes; `continue stateloop;` would be valid 4975 break scriptdatadoubleescapedlessthanloop; 4976 default: 4977 /* 4978 * Anything else Reconsume the current input 4979 * character in the script data double escaped 4980 * state. 4981 */ 4982 reconsume = true; 4983 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 4984 continue stateloop; 4985 } 4986 } 4987 // CPPONLY: MOZ_FALLTHROUGH; 4988 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 4989 scriptdatadoubleescapeendloop: for (;;) { 4990 if (++pos == endPos) { 4991 break stateloop; 4992 } 4993 c = checkChar(buf, pos); 4994 if (index < 6) { // SCRIPT_ARR.length 4995 char folded = c; 4996 if (c >= 'A' && c <= 'Z') { 4997 folded += 0x20; 4998 } 4999 if (folded != Tokenizer.SCRIPT_ARR[index]) { 5000 reconsume = true; 5001 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 5002 continue stateloop; 5003 } 5004 index++; 5005 continue; 5006 } 5007 switch (c) { 5008 case '\r': 5009 emitCarriageReturn(buf, pos); 5010 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 5011 break stateloop; 5012 case '\n': 5013 silentLineFeed(); 5014 // CPPONLY: MOZ_FALLTHROUGH; 5015 case ' ': 5016 case '\t': 5017 case '\u000C': 5018 case '/': 5019 case '>': 5020 /* 5021 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5022 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5023 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 5024 * (>) Emit the current input character as a 5025 * character token. If the temporary buffer is 5026 * the string "script", then switch to the 5027 * script data escaped state. 5028 */ 5029 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 5030 continue stateloop; 5031 default: 5032 /* 5033 * Reconsume the current input character in the 5034 * script data double escaped state. 
5035 */ 5036 reconsume = true; 5037 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 5038 continue stateloop; 5039 } 5040 } 5041 // no fallthrough, reordering opportunity 5042 case MARKUP_DECLARATION_OCTYPE: 5043 markupdeclarationdoctypeloop: for (;;) { 5044 if (++pos == endPos) { 5045 break stateloop; 5046 } 5047 c = checkChar(buf, pos); 5048 if (index < 6) { // OCTYPE.length 5049 char folded = c; 5050 if (c >= 'A' && c <= 'Z') { 5051 folded += 0x20; 5052 } 5053 if (folded == Tokenizer.OCTYPE[index]) { 5054 appendStrBuf(c); 5055 } else { 5056 errBogusComment(); 5057 reconsume = true; 5058 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 5059 continue stateloop; 5060 } 5061 index++; 5062 continue; 5063 } else { 5064 reconsume = true; 5065 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); 5066 // `break` optimizes; `continue stateloop;` would be valid 5067 break markupdeclarationdoctypeloop; 5068 } 5069 } 5070 // CPPONLY: MOZ_FALLTHROUGH; 5071 case DOCTYPE: 5072 doctypeloop: for (;;) { 5073 if (reconsume) { 5074 reconsume = false; 5075 } else { 5076 if (++pos == endPos) { 5077 break stateloop; 5078 } 5079 c = checkChar(buf, pos); 5080 } 5081 initDoctypeFields(); 5082 /* 5083 * Consume the next input character: 5084 */ 5085 switch (c) { 5086 case '\r': 5087 silentCarriageReturn(); 5088 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 5089 break stateloop; 5090 case '\n': 5091 silentLineFeed(); 5092 // CPPONLY: MOZ_FALLTHROUGH; 5093 case ' ': 5094 case '\t': 5095 case '\u000C': 5096 /* 5097 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5098 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5099 * Switch to the before DOCTYPE name state. 5100 */ 5101 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 5102 // `break` optimizes; `continue stateloop;` would be valid 5103 break doctypeloop; 5104 default: 5105 /* 5106 * Anything else Parse error. 
5107 */ 5108 errMissingSpaceBeforeDoctypeName(); 5109 /* 5110 * Reconsume the current character in the before 5111 * DOCTYPE name state. 5112 */ 5113 reconsume = true; 5114 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 5115 // `break` optimizes; `continue stateloop;` would be valid 5116 break doctypeloop; 5117 } 5118 } 5119 // CPPONLY: MOZ_FALLTHROUGH; 5120 case BEFORE_DOCTYPE_NAME: 5121 beforedoctypenameloop: for (;;) { 5122 if (reconsume) { 5123 reconsume = false; 5124 } else { 5125 if (++pos == endPos) { 5126 break stateloop; 5127 } 5128 c = checkChar(buf, pos); 5129 } 5130 /* 5131 * Consume the next input character: 5132 */ 5133 switch (c) { 5134 case '\r': 5135 silentCarriageReturn(); 5136 break stateloop; 5137 case '\n': 5138 silentLineFeed(); 5139 // CPPONLY: MOZ_FALLTHROUGH; 5140 case ' ': 5141 case '\t': 5142 case '\u000C': 5143 /* 5144 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5145 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5146 * in the before DOCTYPE name state. 5147 */ 5148 continue; 5149 case '>': 5150 /* 5151 * U+003E GREATER-THAN SIGN (>) Parse error. 5152 */ 5153 errNamelessDoctype(); 5154 /* 5155 * Create a new DOCTYPE token. Set its 5156 * force-quirks flag to on. 5157 */ 5158 forceQuirks = true; 5159 /* 5160 * Emit the token. 5161 */ 5162 emitDoctypeToken(pos); 5163 /* 5164 * Switch to the data state. 5165 */ 5166 state = transition(state, Tokenizer.DATA, reconsume, pos); 5167 if (shouldSuspend) { 5168 break stateloop; 5169 } 5170 continue stateloop; 5171 case '\u0000': 5172 c = '\uFFFD'; 5173 // CPPONLY: MOZ_FALLTHROUGH; 5174 default: 5175 if (c >= 'A' && c <= 'Z') { 5176 /* 5177 * U+0041 LATIN CAPITAL LETTER A through to 5178 * U+005A LATIN CAPITAL LETTER Z Create a 5179 * new DOCTYPE token. Set the token's name 5180 * to the lowercase version of the input 5181 * character (add 0x0020 to the character's 5182 * code point). 5183 */ 5184 c += 0x20; 5185 } 5186 /* Anything else Create a new DOCTYPE token. 
*/ 5187 /* 5188 * Set the token's name name to the current 5189 * input character. 5190 */ 5191 clearStrBufBeforeUse(); 5192 appendStrBuf(c); 5193 /* 5194 * Switch to the DOCTYPE name state. 5195 */ 5196 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); 5197 // `break` optimizes; `continue stateloop;` would be valid 5198 break beforedoctypenameloop; 5199 } 5200 } 5201 // CPPONLY: MOZ_FALLTHROUGH; 5202 case DOCTYPE_NAME: 5203 doctypenameloop: for (;;) { 5204 if (++pos == endPos) { 5205 break stateloop; 5206 } 5207 c = checkChar(buf, pos); 5208 /* 5209 * Consume the next input character: 5210 */ 5211 switch (c) { 5212 case '\r': 5213 silentCarriageReturn(); 5214 strBufToDoctypeName(); 5215 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 5216 break stateloop; 5217 case '\n': 5218 silentLineFeed(); 5219 // CPPONLY: MOZ_FALLTHROUGH; 5220 case ' ': 5221 case '\t': 5222 case '\u000C': 5223 /* 5224 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5225 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5226 * Switch to the after DOCTYPE name state. 5227 */ 5228 strBufToDoctypeName(); 5229 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 5230 // `break` optimizes; `continue stateloop;` would be valid 5231 break doctypenameloop; 5232 case '>': 5233 /* 5234 * U+003E GREATER-THAN SIGN (>) Emit the current 5235 * DOCTYPE token. 5236 */ 5237 strBufToDoctypeName(); 5238 emitDoctypeToken(pos); 5239 /* 5240 * Switch to the data state. 5241 */ 5242 state = transition(state, Tokenizer.DATA, reconsume, pos); 5243 if (shouldSuspend) { 5244 break stateloop; 5245 } 5246 continue stateloop; 5247 case '\u0000': 5248 c = '\uFFFD'; 5249 // CPPONLY: MOZ_FALLTHROUGH; 5250 default: 5251 /* 5252 * U+0041 LATIN CAPITAL LETTER A through to 5253 * U+005A LATIN CAPITAL LETTER Z Append the 5254 * lowercase version of the input character (add 5255 * 0x0020 to the character's code point) to the 5256 * current DOCTYPE token's name. 
5257 */ 5258 if (c >= 'A' && c <= 'Z') { 5259 c += 0x0020; 5260 } 5261 /* 5262 * Anything else Append the current input 5263 * character to the current DOCTYPE token's 5264 * name. 5265 */ 5266 appendStrBuf(c); 5267 /* 5268 * Stay in the DOCTYPE name state. 5269 */ 5270 continue; 5271 } 5272 } 5273 // CPPONLY: MOZ_FALLTHROUGH; 5274 case AFTER_DOCTYPE_NAME: 5275 afterdoctypenameloop: for (;;) { 5276 if (++pos == endPos) { 5277 break stateloop; 5278 } 5279 c = checkChar(buf, pos); 5280 /* 5281 * Consume the next input character: 5282 */ 5283 switch (c) { 5284 case '\r': 5285 silentCarriageReturn(); 5286 break stateloop; 5287 case '\n': 5288 silentLineFeed(); 5289 // CPPONLY: MOZ_FALLTHROUGH; 5290 case ' ': 5291 case '\t': 5292 case '\u000C': 5293 /* 5294 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5295 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5296 * in the after DOCTYPE name state. 5297 */ 5298 continue; 5299 case '>': 5300 /* 5301 * U+003E GREATER-THAN SIGN (>) Emit the current 5302 * DOCTYPE token. 5303 */ 5304 emitDoctypeToken(pos); 5305 /* 5306 * Switch to the data state. 5307 */ 5308 state = transition(state, Tokenizer.DATA, reconsume, pos); 5309 if (shouldSuspend) { 5310 break stateloop; 5311 } 5312 continue stateloop; 5313 case 'p': 5314 case 'P': 5315 index = 0; 5316 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); 5317 // `break` optimizes; `continue stateloop;` would be valid 5318 break afterdoctypenameloop; 5319 case 's': 5320 case 'S': 5321 index = 0; 5322 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); 5323 continue stateloop; 5324 default: 5325 /* 5326 * Otherwise, this is the parse error. 5327 */ 5328 bogusDoctype(); 5329 5330 /* 5331 * Set the DOCTYPE token's force-quirks flag to 5332 * on. 5333 */ 5334 // done by bogusDoctype(); 5335 /* 5336 * Switch to the bogus DOCTYPE state. 
5337 */ 5338 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5339 continue stateloop; 5340 } 5341 } 5342 // CPPONLY: MOZ_FALLTHROUGH; 5343 case DOCTYPE_UBLIC: 5344 doctypeublicloop: for (;;) { 5345 if (++pos == endPos) { 5346 break stateloop; 5347 } 5348 c = checkChar(buf, pos); 5349 /* 5350 * If the six characters starting from the current input 5351 * character are an ASCII case-insensitive match for the 5352 * word "PUBLIC", then consume those characters and 5353 * switch to the before DOCTYPE public identifier state. 5354 */ 5355 if (index < 5) { // UBLIC.length 5356 char folded = c; 5357 if (c >= 'A' && c <= 'Z') { 5358 folded += 0x20; 5359 } 5360 if (folded != Tokenizer.UBLIC[index]) { 5361 bogusDoctype(); 5362 // forceQuirks = true; 5363 reconsume = true; 5364 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5365 continue stateloop; 5366 } 5367 index++; 5368 continue; 5369 } else { 5370 reconsume = true; 5371 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); 5372 // `break` optimizes; `continue stateloop;` would be valid 5373 break doctypeublicloop; 5374 } 5375 } 5376 // CPPONLY: MOZ_FALLTHROUGH; 5377 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 5378 afterdoctypepublickeywordloop: for (;;) { 5379 if (reconsume) { 5380 reconsume = false; 5381 } else { 5382 if (++pos == endPos) { 5383 break stateloop; 5384 } 5385 c = checkChar(buf, pos); 5386 } 5387 /* 5388 * Consume the next input character: 5389 */ 5390 switch (c) { 5391 case '\r': 5392 silentCarriageReturn(); 5393 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5394 break stateloop; 5395 case '\n': 5396 silentLineFeed(); 5397 // CPPONLY: MOZ_FALLTHROUGH; 5398 case ' ': 5399 case '\t': 5400 case '\u000C': 5401 /* 5402 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5403 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5404 * Switch to the before DOCTYPE public 5405 * identifier state. 
5406 */ 5407 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5408 // `break` optimizes; `continue stateloop;` would be valid 5409 break afterdoctypepublickeywordloop; 5410 case '"': 5411 /* 5412 * U+0022 QUOTATION MARK (") Parse Error. 5413 */ 5414 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 5415 /* 5416 * Set the DOCTYPE token's public identifier to 5417 * the empty string (not missing), 5418 */ 5419 clearStrBufBeforeUse(); 5420 /* 5421 * then switch to the DOCTYPE public identifier 5422 * (double-quoted) state. 5423 */ 5424 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5425 continue stateloop; 5426 case '\'': 5427 /* 5428 * U+0027 APOSTROPHE (') Parse Error. 5429 */ 5430 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 5431 /* 5432 * Set the DOCTYPE token's public identifier to 5433 * the empty string (not missing), 5434 */ 5435 clearStrBufBeforeUse(); 5436 /* 5437 * then switch to the DOCTYPE public identifier 5438 * (single-quoted) state. 5439 */ 5440 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5441 continue stateloop; 5442 case '>': 5443 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5444 errExpectedPublicId(); 5445 /* 5446 * Set the DOCTYPE token's force-quirks flag to 5447 * on. 5448 */ 5449 forceQuirks = true; 5450 /* 5451 * Emit that DOCTYPE token. 5452 */ 5453 emitDoctypeToken(pos); 5454 /* 5455 * Switch to the data state. 5456 */ 5457 state = transition(state, Tokenizer.DATA, reconsume, pos); 5458 if (shouldSuspend) { 5459 break stateloop; 5460 } 5461 continue stateloop; 5462 default: 5463 bogusDoctype(); 5464 /* 5465 * Set the DOCTYPE token's force-quirks flag to 5466 * on. 5467 */ 5468 // done by bogusDoctype(); 5469 /* 5470 * Switch to the bogus DOCTYPE state. 
5471 */ 5472 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5473 continue stateloop; 5474 } 5475 } 5476 // CPPONLY: MOZ_FALLTHROUGH; 5477 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 5478 beforedoctypepublicidentifierloop: for (;;) { 5479 if (++pos == endPos) { 5480 break stateloop; 5481 } 5482 c = checkChar(buf, pos); 5483 /* 5484 * Consume the next input character: 5485 */ 5486 switch (c) { 5487 case '\r': 5488 silentCarriageReturn(); 5489 break stateloop; 5490 case '\n': 5491 silentLineFeed(); 5492 // CPPONLY: MOZ_FALLTHROUGH; 5493 case ' ': 5494 case '\t': 5495 case '\u000C': 5496 /* 5497 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5498 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5499 * in the before DOCTYPE public identifier 5500 * state. 5501 */ 5502 continue; 5503 case '"': 5504 /* 5505 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5506 * token's public identifier to the empty string 5507 * (not missing), 5508 */ 5509 clearStrBufBeforeUse(); 5510 /* 5511 * then switch to the DOCTYPE public identifier 5512 * (double-quoted) state. 5513 */ 5514 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5515 // `break` optimizes; `continue stateloop;` would be valid 5516 break beforedoctypepublicidentifierloop; 5517 case '\'': 5518 /* 5519 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5520 * public identifier to the empty string (not 5521 * missing), 5522 */ 5523 clearStrBufBeforeUse(); 5524 /* 5525 * then switch to the DOCTYPE public identifier 5526 * (single-quoted) state. 5527 */ 5528 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5529 continue stateloop; 5530 case '>': 5531 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 5532 errExpectedPublicId(); 5533 /* 5534 * Set the DOCTYPE token's force-quirks flag to 5535 * on. 5536 */ 5537 forceQuirks = true; 5538 /* 5539 * Emit that DOCTYPE token. 
5540 */ 5541 emitDoctypeToken(pos); 5542 /* 5543 * Switch to the data state. 5544 */ 5545 state = transition(state, Tokenizer.DATA, reconsume, pos); 5546 if (shouldSuspend) { 5547 break stateloop; 5548 } 5549 continue stateloop; 5550 default: 5551 bogusDoctype(); 5552 /* 5553 * Set the DOCTYPE token's force-quirks flag to 5554 * on. 5555 */ 5556 // done by bogusDoctype(); 5557 /* 5558 * Switch to the bogus DOCTYPE state. 5559 */ 5560 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5561 continue stateloop; 5562 } 5563 } 5564 // CPPONLY: MOZ_FALLTHROUGH; 5565 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 5566 doctypepublicidentifierdoublequotedloop: for (;;) { 5567 if (++pos == endPos) { 5568 break stateloop; 5569 } 5570 c = checkChar(buf, pos); 5571 /* 5572 * Consume the next input character: 5573 */ 5574 switch (c) { 5575 case '"': 5576 /* 5577 * U+0022 QUOTATION MARK (") Switch to the after 5578 * DOCTYPE public identifier state. 5579 */ 5580 publicIdentifier = strBufToString(); 5581 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 5582 // `break` optimizes; `continue stateloop;` would be valid 5583 break doctypepublicidentifierdoublequotedloop; 5584 case '>': 5585 /* 5586 * U+003E GREATER-THAN SIGN (>) Parse error. 5587 */ 5588 errGtInPublicId(); 5589 /* 5590 * Set the DOCTYPE token's force-quirks flag to 5591 * on. 5592 */ 5593 forceQuirks = true; 5594 /* 5595 * Emit that DOCTYPE token. 5596 */ 5597 publicIdentifier = strBufToString(); 5598 emitDoctypeToken(pos); 5599 /* 5600 * Switch to the data state. 
5601 */ 5602 state = transition(state, Tokenizer.DATA, reconsume, pos); 5603 if (shouldSuspend) { 5604 break stateloop; 5605 } 5606 continue stateloop; 5607 case '\r': 5608 appendStrBufCarriageReturn(); 5609 break stateloop; 5610 case '\n': 5611 appendStrBufLineFeed(); 5612 continue; 5613 case '\u0000': 5614 c = '\uFFFD'; 5615 // CPPONLY: MOZ_FALLTHROUGH; 5616 default: 5617 /* 5618 * Anything else Append the current input 5619 * character to the current DOCTYPE token's 5620 * public identifier. 5621 */ 5622 appendStrBuf(c); 5623 /* 5624 * Stay in the DOCTYPE public identifier 5625 * (double-quoted) state. 5626 */ 5627 continue; 5628 } 5629 } 5630 // CPPONLY: MOZ_FALLTHROUGH; 5631 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 5632 afterdoctypepublicidentifierloop: for (;;) { 5633 if (++pos == endPos) { 5634 break stateloop; 5635 } 5636 c = checkChar(buf, pos); 5637 /* 5638 * Consume the next input character: 5639 */ 5640 switch (c) { 5641 case '\r': 5642 silentCarriageReturn(); 5643 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5644 break stateloop; 5645 case '\n': 5646 silentLineFeed(); 5647 // CPPONLY: MOZ_FALLTHROUGH; 5648 case ' ': 5649 case '\t': 5650 case '\u000C': 5651 /* 5652 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5653 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 5654 * Switch to the between DOCTYPE public and 5655 * system identifiers state. 5656 */ 5657 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 5658 // `break` optimizes; `continue stateloop;` would be valid 5659 break afterdoctypepublicidentifierloop; 5660 case '>': 5661 /* 5662 * U+003E GREATER-THAN SIGN (>) Emit the current 5663 * DOCTYPE token. 5664 */ 5665 emitDoctypeToken(pos); 5666 /* 5667 * Switch to the data state. 
5668 */ 5669 state = transition(state, Tokenizer.DATA, reconsume, pos); 5670 if (shouldSuspend) { 5671 break stateloop; 5672 } 5673 continue stateloop; 5674 case '"': 5675 /* 5676 * U+0022 QUOTATION MARK (") Parse error. 5677 */ 5678 errNoSpaceBetweenPublicAndSystemIds(); 5679 /* 5680 * Set the DOCTYPE token's system identifier to 5681 * the empty string (not missing), 5682 */ 5683 clearStrBufBeforeUse(); 5684 /* 5685 * then switch to the DOCTYPE system identifier 5686 * (double-quoted) state. 5687 */ 5688 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5689 continue stateloop; 5690 case '\'': 5691 /* 5692 * U+0027 APOSTROPHE (') Parse error. 5693 */ 5694 errNoSpaceBetweenPublicAndSystemIds(); 5695 /* 5696 * Set the DOCTYPE token's system identifier to 5697 * the empty string (not missing), 5698 */ 5699 clearStrBufBeforeUse(); 5700 /* 5701 * then switch to the DOCTYPE system identifier 5702 * (single-quoted) state. 5703 */ 5704 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5705 continue stateloop; 5706 default: 5707 bogusDoctype(); 5708 /* 5709 * Set the DOCTYPE token's force-quirks flag to 5710 * on. 5711 */ 5712 // done by bogusDoctype(); 5713 /* 5714 * Switch to the bogus DOCTYPE state. 
5715 */ 5716 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5717 continue stateloop; 5718 } 5719 } 5720 // CPPONLY: MOZ_FALLTHROUGH; 5721 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 5722 betweendoctypepublicandsystemidentifiersloop: for (;;) { 5723 if (++pos == endPos) { 5724 break stateloop; 5725 } 5726 c = checkChar(buf, pos); 5727 /* 5728 * Consume the next input character: 5729 */ 5730 switch (c) { 5731 case '\r': 5732 silentCarriageReturn(); 5733 break stateloop; 5734 case '\n': 5735 silentLineFeed(); 5736 // CPPONLY: MOZ_FALLTHROUGH; 5737 case ' ': 5738 case '\t': 5739 case '\u000C': 5740 /* 5741 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5742 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5743 * in the between DOCTYPE public and system 5744 * identifiers state. 5745 */ 5746 continue; 5747 case '>': 5748 /* 5749 * U+003E GREATER-THAN SIGN (>) Emit the current 5750 * DOCTYPE token. 5751 */ 5752 emitDoctypeToken(pos); 5753 /* 5754 * Switch to the data state. 5755 */ 5756 state = transition(state, Tokenizer.DATA, reconsume, pos); 5757 if (shouldSuspend) { 5758 break stateloop; 5759 } 5760 continue stateloop; 5761 case '"': 5762 /* 5763 * U+0022 QUOTATION MARK (") Set the DOCTYPE 5764 * token's system identifier to the empty string 5765 * (not missing), 5766 */ 5767 clearStrBufBeforeUse(); 5768 /* 5769 * then switch to the DOCTYPE system identifier 5770 * (double-quoted) state. 5771 */ 5772 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 5773 // `break` optimizes; `continue stateloop;` would be valid 5774 break betweendoctypepublicandsystemidentifiersloop; 5775 case '\'': 5776 /* 5777 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 5778 * system identifier to the empty string (not 5779 * missing), 5780 */ 5781 clearStrBufBeforeUse(); 5782 /* 5783 * then switch to the DOCTYPE system identifier 5784 * (single-quoted) state. 
5785 */ 5786 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 5787 continue stateloop; 5788 default: 5789 bogusDoctype(); 5790 /* 5791 * Set the DOCTYPE token's force-quirks flag to 5792 * on. 5793 */ 5794 // done by bogusDoctype(); 5795 /* 5796 * Switch to the bogus DOCTYPE state. 5797 */ 5798 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5799 continue stateloop; 5800 } 5801 } 5802 // CPPONLY: MOZ_FALLTHROUGH; 5803 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 5804 doctypesystemidentifierdoublequotedloop: for (;;) { 5805 if (++pos == endPos) { 5806 break stateloop; 5807 } 5808 c = checkChar(buf, pos); 5809 /* 5810 * Consume the next input character: 5811 */ 5812 switch (c) { 5813 case '"': 5814 /* 5815 * U+0022 QUOTATION MARK (") Switch to the after 5816 * DOCTYPE system identifier state. 5817 */ 5818 systemIdentifier = strBufToString(); 5819 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 5820 // `break` optimizes; `continue stateloop;` would be valid 5821 break doctypesystemidentifierdoublequotedloop; 5822 case '>': 5823 /* 5824 * U+003E GREATER-THAN SIGN (>) Parse error. 5825 */ 5826 errGtInSystemId(); 5827 /* 5828 * Set the DOCTYPE token's force-quirks flag to 5829 * on. 5830 */ 5831 forceQuirks = true; 5832 /* 5833 * Emit that DOCTYPE token. 5834 */ 5835 systemIdentifier = strBufToString(); 5836 emitDoctypeToken(pos); 5837 /* 5838 * Switch to the data state. 5839 */ 5840 state = transition(state, Tokenizer.DATA, reconsume, pos); 5841 if (shouldSuspend) { 5842 break stateloop; 5843 } 5844 continue stateloop; 5845 case '\r': 5846 appendStrBufCarriageReturn(); 5847 break stateloop; 5848 case '\n': 5849 appendStrBufLineFeed(); 5850 continue; 5851 case '\u0000': 5852 c = '\uFFFD'; 5853 // CPPONLY: MOZ_FALLTHROUGH; 5854 default: 5855 /* 5856 * Anything else Append the current input 5857 * character to the current DOCTYPE token's 5858 * system identifier. 
5859 */ 5860 appendStrBuf(c); 5861 /* 5862 * Stay in the DOCTYPE system identifier 5863 * (double-quoted) state. 5864 */ 5865 continue; 5866 } 5867 } 5868 // CPPONLY: MOZ_FALLTHROUGH; 5869 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 5870 afterdoctypesystemidentifierloop: for (;;) { 5871 if (++pos == endPos) { 5872 break stateloop; 5873 } 5874 c = checkChar(buf, pos); 5875 /* 5876 * Consume the next input character: 5877 */ 5878 switch (c) { 5879 case '\r': 5880 silentCarriageReturn(); 5881 break stateloop; 5882 case '\n': 5883 silentLineFeed(); 5884 // CPPONLY: MOZ_FALLTHROUGH; 5885 case ' ': 5886 case '\t': 5887 case '\u000C': 5888 /* 5889 * U+0009 CHARACTER TABULATION U+000A LINE FEED 5890 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 5891 * in the after DOCTYPE system identifier state. 5892 */ 5893 continue; 5894 case '>': 5895 /* 5896 * U+003E GREATER-THAN SIGN (>) Emit the current 5897 * DOCTYPE token. 5898 */ 5899 emitDoctypeToken(pos); 5900 /* 5901 * Switch to the data state. 5902 */ 5903 state = transition(state, Tokenizer.DATA, reconsume, pos); 5904 if (shouldSuspend) { 5905 break stateloop; 5906 } 5907 continue stateloop; 5908 default: 5909 /* 5910 * Switch to the bogus DOCTYPE state. (This does 5911 * not set the DOCTYPE token's force-quirks flag 5912 * to on.) 5913 */ 5914 bogusDoctypeWithoutQuirks(); 5915 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5916 // `break` optimizes; `continue stateloop;` would be valid 5917 break afterdoctypesystemidentifierloop; 5918 } 5919 } 5920 // CPPONLY: MOZ_FALLTHROUGH; 5921 case BOGUS_DOCTYPE: 5922 for (;;) { 5923 if (reconsume) { 5924 reconsume = false; 5925 } else { 5926 if (++pos == endPos) { 5927 break stateloop; 5928 } 5929 c = checkChar(buf, pos); 5930 } 5931 /* 5932 * Consume the next input character: 5933 */ 5934 switch (c) { 5935 case '>': 5936 /* 5937 * U+003E GREATER-THAN SIGN (>) Emit that 5938 * DOCTYPE token. 5939 */ 5940 emitDoctypeToken(pos); 5941 /* 5942 * Switch to the data state. 
5943 */ 5944 state = transition(state, Tokenizer.DATA, reconsume, pos); 5945 if (shouldSuspend) { 5946 break stateloop; 5947 } 5948 continue stateloop; 5949 case '\r': 5950 silentCarriageReturn(); 5951 break stateloop; 5952 case '\n': 5953 silentLineFeed(); 5954 // CPPONLY: MOZ_FALLTHROUGH; 5955 default: 5956 /* 5957 * Anything else Stay in the bogus DOCTYPE 5958 * state. 5959 */ 5960 continue; 5961 } 5962 } 5963 // no fallthrough, reordering opportunity 5964 case DOCTYPE_YSTEM: 5965 doctypeystemloop: for (;;) { 5966 if (++pos == endPos) { 5967 break stateloop; 5968 } 5969 c = checkChar(buf, pos); 5970 /* 5971 * Otherwise, if the six characters starting from the 5972 * current input character are an ASCII case-insensitive 5973 * match for the word "SYSTEM", then consume those 5974 * characters and switch to the before DOCTYPE system 5975 * identifier state. 5976 */ 5977 if (index < 5) { // YSTEM.length 5978 char folded = c; 5979 if (c >= 'A' && c <= 'Z') { 5980 folded += 0x20; 5981 } 5982 if (folded != Tokenizer.YSTEM[index]) { 5983 bogusDoctype(); 5984 reconsume = true; 5985 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 5986 continue stateloop; 5987 } 5988 index++; 5989 continue stateloop; 5990 } else { 5991 reconsume = true; 5992 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); 5993 // `break` optimizes; `continue stateloop;` would be valid 5994 break doctypeystemloop; 5995 } 5996 } 5997 // CPPONLY: MOZ_FALLTHROUGH; 5998 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 5999 afterdoctypesystemkeywordloop: for (;;) { 6000 if (reconsume) { 6001 reconsume = false; 6002 } else { 6003 if (++pos == endPos) { 6004 break stateloop; 6005 } 6006 c = checkChar(buf, pos); 6007 } 6008 /* 6009 * Consume the next input character: 6010 */ 6011 switch (c) { 6012 case '\r': 6013 silentCarriageReturn(); 6014 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 6015 break stateloop; 6016 case '\n': 6017 
silentLineFeed(); 6018 // CPPONLY: MOZ_FALLTHROUGH; 6019 case ' ': 6020 case '\t': 6021 case '\u000C': 6022 /* 6023 * U+0009 CHARACTER TABULATION U+000A LINE FEED 6024 * (LF) U+000C FORM FEED (FF) U+0020 SPACE 6025 * Switch to the before DOCTYPE public 6026 * identifier state. 6027 */ 6028 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 6029 // `break` optimizes; `continue stateloop;` would be valid 6030 break afterdoctypesystemkeywordloop; 6031 case '"': 6032 /* 6033 * U+0022 QUOTATION MARK (") Parse Error. 6034 */ 6035 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 6036 /* 6037 * Set the DOCTYPE token's system identifier to 6038 * the empty string (not missing), 6039 */ 6040 clearStrBufBeforeUse(); 6041 /* 6042 * then switch to the DOCTYPE public identifier 6043 * (double-quoted) state. 6044 */ 6045 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 6046 continue stateloop; 6047 case '\'': 6048 /* 6049 * U+0027 APOSTROPHE (') Parse Error. 6050 */ 6051 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 6052 /* 6053 * Set the DOCTYPE token's public identifier to 6054 * the empty string (not missing), 6055 */ 6056 clearStrBufBeforeUse(); 6057 /* 6058 * then switch to the DOCTYPE public identifier 6059 * (single-quoted) state. 6060 */ 6061 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 6062 continue stateloop; 6063 case '>': 6064 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 6065 errExpectedPublicId(); 6066 /* 6067 * Set the DOCTYPE token's force-quirks flag to 6068 * on. 6069 */ 6070 forceQuirks = true; 6071 /* 6072 * Emit that DOCTYPE token. 6073 */ 6074 emitDoctypeToken(pos); 6075 /* 6076 * Switch to the data state. 
6077 */ 6078 state = transition(state, Tokenizer.DATA, reconsume, pos); 6079 if (shouldSuspend) { 6080 break stateloop; 6081 } 6082 continue stateloop; 6083 default: 6084 bogusDoctype(); 6085 /* 6086 * Set the DOCTYPE token's force-quirks flag to 6087 * on. 6088 */ 6089 // done by bogusDoctype(); 6090 /* 6091 * Switch to the bogus DOCTYPE state. 6092 */ 6093 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 6094 continue stateloop; 6095 } 6096 } 6097 // CPPONLY: MOZ_FALLTHROUGH; 6098 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 6099 beforedoctypesystemidentifierloop: for (;;) { 6100 if (++pos == endPos) { 6101 break stateloop; 6102 } 6103 c = checkChar(buf, pos); 6104 /* 6105 * Consume the next input character: 6106 */ 6107 switch (c) { 6108 case '\r': 6109 silentCarriageReturn(); 6110 break stateloop; 6111 case '\n': 6112 silentLineFeed(); 6113 // CPPONLY: MOZ_FALLTHROUGH; 6114 case ' ': 6115 case '\t': 6116 case '\u000C': 6117 /* 6118 * U+0009 CHARACTER TABULATION U+000A LINE FEED 6119 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 6120 * in the before DOCTYPE system identifier 6121 * state. 6122 */ 6123 continue; 6124 case '"': 6125 /* 6126 * U+0022 QUOTATION MARK (") Set the DOCTYPE 6127 * token's system identifier to the empty string 6128 * (not missing), 6129 */ 6130 clearStrBufBeforeUse(); 6131 /* 6132 * then switch to the DOCTYPE system identifier 6133 * (double-quoted) state. 6134 */ 6135 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 6136 continue stateloop; 6137 case '\'': 6138 /* 6139 * U+0027 APOSTROPHE (') Set the DOCTYPE token's 6140 * system identifier to the empty string (not 6141 * missing), 6142 */ 6143 clearStrBufBeforeUse(); 6144 /* 6145 * then switch to the DOCTYPE system identifier 6146 * (single-quoted) state. 
6147 */ 6148 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 6149 // `break` optimizes; `continue stateloop;` would be valid 6150 break beforedoctypesystemidentifierloop; 6151 case '>': 6152 /* U+003E GREATER-THAN SIGN (>) Parse error. */ 6153 errExpectedSystemId(); 6154 /* 6155 * Set the DOCTYPE token's force-quirks flag to 6156 * on. 6157 */ 6158 forceQuirks = true; 6159 /* 6160 * Emit that DOCTYPE token. 6161 */ 6162 emitDoctypeToken(pos); 6163 /* 6164 * Switch to the data state. 6165 */ 6166 state = transition(state, Tokenizer.DATA, reconsume, pos); 6167 if (shouldSuspend) { 6168 break stateloop; 6169 } 6170 continue stateloop; 6171 default: 6172 bogusDoctype(); 6173 /* 6174 * Set the DOCTYPE token's force-quirks flag to 6175 * on. 6176 */ 6177 // done by bogusDoctype(); 6178 /* 6179 * Switch to the bogus DOCTYPE state. 6180 */ 6181 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 6182 continue stateloop; 6183 } 6184 } 6185 // CPPONLY: MOZ_FALLTHROUGH; 6186 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 6187 for (;;) { 6188 if (++pos == endPos) { 6189 break stateloop; 6190 } 6191 c = checkChar(buf, pos); 6192 /* 6193 * Consume the next input character: 6194 */ 6195 switch (c) { 6196 case '\'': 6197 /* 6198 * U+0027 APOSTROPHE (') Switch to the after 6199 * DOCTYPE system identifier state. 6200 */ 6201 systemIdentifier = strBufToString(); 6202 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 6203 continue stateloop; 6204 case '>': 6205 errGtInSystemId(); 6206 /* 6207 * Set the DOCTYPE token's force-quirks flag to 6208 * on. 6209 */ 6210 forceQuirks = true; 6211 /* 6212 * Emit that DOCTYPE token. 6213 */ 6214 systemIdentifier = strBufToString(); 6215 emitDoctypeToken(pos); 6216 /* 6217 * Switch to the data state. 
6218 */ 6219 state = transition(state, Tokenizer.DATA, reconsume, pos); 6220 if (shouldSuspend) { 6221 break stateloop; 6222 } 6223 continue stateloop; 6224 case '\r': 6225 appendStrBufCarriageReturn(); 6226 break stateloop; 6227 case '\n': 6228 appendStrBufLineFeed(); 6229 continue; 6230 case '\u0000': 6231 c = '\uFFFD'; 6232 // CPPONLY: MOZ_FALLTHROUGH; 6233 default: 6234 /* 6235 * Anything else Append the current input 6236 * character to the current DOCTYPE token's 6237 * system identifier. 6238 */ 6239 appendStrBuf(c); 6240 /* 6241 * Stay in the DOCTYPE system identifier 6242 * (double-quoted) state. 6243 */ 6244 continue; 6245 } 6246 } 6247 // no fallthrough, reordering opportunity 6248 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 6249 for (;;) { 6250 if (++pos == endPos) { 6251 break stateloop; 6252 } 6253 c = checkChar(buf, pos); 6254 /* 6255 * Consume the next input character: 6256 */ 6257 switch (c) { 6258 case '\'': 6259 /* 6260 * U+0027 APOSTROPHE (') Switch to the after 6261 * DOCTYPE public identifier state. 6262 */ 6263 publicIdentifier = strBufToString(); 6264 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 6265 continue stateloop; 6266 case '>': 6267 errGtInPublicId(); 6268 /* 6269 * Set the DOCTYPE token's force-quirks flag to 6270 * on. 6271 */ 6272 forceQuirks = true; 6273 /* 6274 * Emit that DOCTYPE token. 6275 */ 6276 publicIdentifier = strBufToString(); 6277 emitDoctypeToken(pos); 6278 /* 6279 * Switch to the data state. 
6280 */ 6281 state = transition(state, Tokenizer.DATA, reconsume, pos); 6282 if (shouldSuspend) { 6283 break stateloop; 6284 } 6285 continue stateloop; 6286 case '\r': 6287 appendStrBufCarriageReturn(); 6288 break stateloop; 6289 case '\n': 6290 appendStrBufLineFeed(); 6291 continue; 6292 case '\u0000': 6293 c = '\uFFFD'; 6294 // CPPONLY: MOZ_FALLTHROUGH; 6295 default: 6296 /* 6297 * Anything else Append the current input 6298 * character to the current DOCTYPE token's 6299 * public identifier. 6300 */ 6301 appendStrBuf(c); 6302 /* 6303 * Stay in the DOCTYPE public identifier 6304 * (single-quoted) state. 6305 */ 6306 continue; 6307 } 6308 } 6309 // no fallthrough, reordering opportunity 6310 case PROCESSING_INSTRUCTION: 6311 processinginstructionloop: for (;;) { 6312 if (++pos == endPos) { 6313 break stateloop; 6314 } 6315 c = checkChar(buf, pos); 6316 switch (c) { 6317 case '?': 6318 state = transition( 6319 state, 6320 Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK, 6321 reconsume, pos); 6322 // `break` optimizes; `continue stateloop;` would be valid 6323 break processinginstructionloop; 6324 default: 6325 continue; 6326 } 6327 } 6328 // CPPONLY: MOZ_FALLTHROUGH; 6329 case PROCESSING_INSTRUCTION_QUESTION_MARK: 6330 if (++pos == endPos) { 6331 break stateloop; 6332 } 6333 c = checkChar(buf, pos); 6334 switch (c) { 6335 case '>': 6336 state = transition(state, Tokenizer.DATA, 6337 reconsume, pos); 6338 // Processing instruction syntax goes through these 6339 // states only in Gecko's XML View Source--not in HTML 6340 // parsing in Java or in Gecko. 6341 // Since XML View Source doesn't use the 6342 // suspension-after-current-token facility, its extension 6343 // to processing-instruction states is strictly unnecessary 6344 // at the moment. 
However, if these states ever were to be 6345 // used together with the suspension-after-current-token 6346 // facility, these states would need to participate, since 6347 // suspension could be requested when only less-than has been 6348 // seen and we don't yet know if we end up here. Handling 6349 // the currently-unnecessary case in order to avoid leaving 6350 // a trap for future modification. 6351 suspendIfRequestedAfterCurrentNonTextToken(); 6352 if (shouldSuspend) { 6353 break stateloop; 6354 } 6355 continue stateloop; 6356 default: 6357 state = transition(state, 6358 Tokenizer.PROCESSING_INSTRUCTION, 6359 reconsume, pos); 6360 continue stateloop; 6361 } 6362 // END HOTSPOT WORKAROUND 6363 } 6364 } 6365 flushChars(buf, pos); 6366 /* 6367 * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; } 6368 */ 6369 // Save locals 6370 stateSave = state; 6371 returnStateSave = returnState; 6372 return pos; 6373 } 6374 6375 // HOTSPOT WORKAROUND INSERTION POINT 6376 6377 // [NOCPP[ 6378 6379 protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException { 6380 return to; 6381 } 6382 6383 // ]NOCPP] 6384 6385 private void initDoctypeFields() { 6386 // Discard the characters "DOCTYPE" accumulated as a potential bogus 6387 // comment into strBuf. 
// Remainder of initDoctypeFields(): reset every per-DOCTYPE field.
        clearStrBufAfterUse();
        doctypeName = null;
        // Release and clear whichever identifier strings are currently held.
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        // Force-quirks starts off; the DOCTYPE states turn it on when needed.
        forceQuirks = false;
    }

    /**
     * Records a carriage return through {@link #silentCarriageReturn()} and
     * then hands {@code '\n'} (not {@code '\r'}) with the flag {@code false}
     * to {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
     *
     * @throws SAXException
     *             declared because the delegate may throw it
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        // NOTE(review): the meaning of the boolean argument is defined by the
        // callee, which is not visible in this part of the file.
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
    }

    /**
     * Records a line feed through {@link #silentLineFeed()} and then hands
     * {@code '\n'} with the flag {@code false} to
     * {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
     *
     * @throws SAXException
     *             declared because the delegate may throw it
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
    }

    /**
     * Records a line feed through {@link #silentLineFeed()} and appends
     * {@code '\n'} with {@code appendStrBuf}.
     */
    @Inline private void appendStrBufLineFeed() {
        silentLineFeed();
        appendStrBuf('\n');
    }

    /**
     * Records a carriage return through {@link #silentCarriageReturn()} and
     * appends {@code '\n'} with {@code appendStrBuf}; the CR itself is never
     * appended.
     */
    @Inline private void appendStrBufCarriageReturn() {
        silentCarriageReturn();
        appendStrBuf('\n');
    }

    // NOTE(review): the [NOCPP[ ... ]NOCPP] markers presumably fence Java-only
    // code for the Java-to-C++ translation; keep them exactly where they are.
    // [NOCPP[

    /**
     * Increments {@code line} and sets {@code lastCR} to {@code true}.
     * Protected, so a subclass may override it.
     */
    @Inline protected void silentCarriageReturn() {
        ++line;
        lastCR = true;
    }

    /**
     * Increments {@code line}; unlike {@link #silentCarriageReturn()} it does
     * not touch {@code lastCR}. Protected, so a subclass may override it.
     */
    @Inline protected void silentLineFeed() {
        ++line;
    }

    // ]NOCPP]

    /**
     * Handles a carriage return seen in character data: records the CR,
     * flushes through {@code flushChars(buf, pos)}, and reports one character
     * from {@code Tokenizer.LF} to the token handler.
     *
     * @param buf
     *            the buffer passed on to {@code flushChars}
     * @param pos
     *            the position passed on to {@code flushChars}
     * @throws SAXException
     *             declared because flushing or the token handler may throw it
     */
    @Inline private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        // Offset 0, length 1 of Tokenizer.LF (presumably a one-element array
        // holding '\n' -- confirm at its declaration).
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): cstart looks like the start index of the pending
        // character run; MAX_VALUE presumably means "no run pending" --
        // confirm against flushChars.
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Flushes through {@code flushChars(buf, pos)}, calls
     * {@code tokenHandler.zeroOriginatingReplacementCharacter()}, and sets
     * {@code cstart} to {@code pos + 1}.
     *
     * @param buf
     *            the buffer passed on to {@code flushChars}
     * @param pos
     *            the position of the current character
     * @throws SAXException
     *             declared because flushing or the token handler may throw it
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        // Continue just past the current character.
        cstart = pos + 1;
    }

    /**
     * Same sequence as {@link #emitReplacementCharacter(char[], int)} except
     * that the token handler callback is {@code zeroOrReplacementCharacter()};
     * what that callback emits is decided by the token handler.
     *
     * @param buf
     *            the buffer passed on to {@code flushChars}
     * @param pos
     *            the position of the current character
     * @throws SAXException
     *             declared because flushing or the token handler may throw it
     */
    private void maybeEmitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOrReplacementCharacter();
        cstart = pos + 1;
    }

    /**
     * Flushes through {@code flushChars(buf, pos)}, reports one character
     * from {@code REPLACEMENT_CHARACTER} through
     * {@code tokenHandler.characters}, and sets {@code cstart} to
     * {@code pos + 1}.
     *
     * @param buf
     *            the buffer passed on to {@code flushChars}
     * @param pos
     *            the position of the current character
     * @throws SAXException
     *             declared because flushing or the token handler may throw it
     */
    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
        cstart = pos + 1;
    }

    /**
     * Stores {@code add} in {@code additional} and, inside the NOCPP region,
     * snapshots the current location into {@code ampersandLocation}.
     *
     * @param add
     *            the value to store in {@code additional}
     */
    @Inline private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        // LocatorImpl is constructed from this tokenizer, which (per the class
        // documentation) implements Locator.
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }

    /**
     * Reports a bogus DOCTYPE through {@code errBogusDoctype()} and turns the
     * force-quirks flag on.
     *
     * @throws SAXException
     *             declared because error reporting may throw it
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }

    /**
     * Reports a bogus DOCTYPE through {@code errBogusDoctype()} and sets the
     * force-quirks flag to {@code false}.
     *
     * @throws SAXException
     *             declared because error reporting may throw it
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks = false;
    }

    /**
     * Turns the numeric character reference held in the {@code value} field
     * into output. Every path ends in {@code emitOrAppendOne} or
     * {@code emitOrAppendTwo}, except the two form-feed cases noted inline.
     *
     * Values up to 0xFFFF produce one {@code char}; values up to 0x10FFFF
     * produce a surrogate pair; larger values are reported and replaced with
     * {@code Tokenizer.REPLACEMENT_CHARACTER}. The order of the
     * {@code else if} tests matters because several ranges overlap.
     *
     * @param returnState
     *            passed through unchanged to the emit-or-append helpers
     * @throws SAXException
     *             declared because error reporting, {@code fatal} or the
     *             emit-or-append helpers may throw it
     */
    private void handleNcrValue(int returnState) throws SAXException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value <= 0xFFFF) {
            if (value >= 0x80 && value <= 0x9f) {
                /*
                 * If that number is one of the numbers in the first column of
                 * the following table, then this is a parse error.
                 */
                errNcrInC1Range();
                /*
                 * Find the row with that number in the first column, and return
                 * a character token for the Unicode character given in the
                 * second column of that row.
                 */
                // The table is indexed from 0x80, hence the subtraction.
                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
                emitOrAppendOne(val, returnState);
                // [NOCPP[
            } else if (value == 0xC
                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
                // Form feed under a non-ALLOW policy: replaced by a space or
                // reported through fatal(). If the policy is neither of the
                // two tested values, or fatal() returns, nothing is emitted.
                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                    emitOrAppendOne(Tokenizer.SPACE, returnState);
                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
                }
                // ]NOCPP]
            } else if (value == 0x0) {
                errNcrZero();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else if ((value & 0xF800) == 0xD800) {
                // The mask matches 0xD800 through 0xDFFF, the surrogate range.
                errNcrSurrogate();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else {
                /*
                 * Otherwise, return a character token for the Unicode character
                 * whose code point is that number.
                 */
                char ch = (char) value;
                // [NOCPP[
                // Diagnostics only, apart from the two helpers whose return
                // value replaces ch.
                if (value == 0x0D) {
                    errNcrCr();
                } else if ((value <= 0x0008) || (value == 0x000B)
                        || (value >= 0x000E && value <= 0x001F)) {
                    ch = errNcrControlChar(ch);
                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                    errNcrUnassigned();
                } else if ((value & 0xFFFE) == 0xFFFE) {
                    // Matches exactly 0xFFFE and 0xFFFF here.
                    ch = errNcrNonCharacter(ch);
                } else if (value >= 0x007F && value <= 0x009F) {
                    // Only 0x7F can reach this test: 0x80 through 0x9F was
                    // consumed by the first branch of the outer chain.
                    errNcrControlChar();
                } else {
                    maybeWarnPrivateUse(ch);
                }
                // ]NOCPP]
                bmpChar[0] = ch;
                emitOrAppendOne(bmpChar, returnState);
            }
        } else if (value <= 0x10FFFF) {
            // [NOCPP[
            maybeWarnPrivateUseAstral();
            // True when the low 16 bits are 0xFFFE or 0xFFFF.
            if ((value & 0xFFFE) == 0xFFFE) {
                errAstralNonCharacter(value);
            }
            // ]NOCPP]
            // Split into a surrogate pair: top bits via LEAD_OFFSET, low ten
            // bits on 0xDC00.
            // NOTE(review): LEAD_OFFSET is declared elsewhere; presumably
            // 0xD800 - (0x10000 >> 10) -- confirm.
            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppendTwo(astralChar, returnState);
        } else {
            // Above 0x10FFFF.
            errNcrOutOfRange();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
} 6556 } 6557 6558 public void eof() throws SAXException { 6559 int state = stateSave; 6560 int returnState = returnStateSave; 6561 6562 eofloop: for (;;) { 6563 switch (state) { 6564 case SCRIPT_DATA_LESS_THAN_SIGN: 6565 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 6566 /* 6567 * Otherwise, emit a U+003C LESS-THAN SIGN character token 6568 */ 6569 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 6570 /* 6571 * and reconsume the current input character in the data 6572 * state. 6573 */ 6574 break eofloop; 6575 case TAG_OPEN: 6576 /* 6577 * The behavior of this state depends on the content model 6578 * flag. 6579 */ 6580 /* 6581 * Anything else Parse error. 6582 */ 6583 errEofAfterLt(); 6584 /* 6585 * Emit a U+003C LESS-THAN SIGN character token 6586 */ 6587 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 6588 /* 6589 * and reconsume the current input character in the data 6590 * state. 6591 */ 6592 break eofloop; 6593 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 6594 /* 6595 * Emit a U+003C LESS-THAN SIGN character token 6596 */ 6597 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 6598 /* 6599 * and reconsume the current input character in the RCDATA 6600 * state. 6601 */ 6602 break eofloop; 6603 case NON_DATA_END_TAG_NAME: 6604 /* 6605 * Emit a U+003C LESS-THAN SIGN character token, a U+002F 6606 * SOLIDUS character token, 6607 */ 6608 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); 6609 /* 6610 * a character token for each of the characters in the 6611 * temporary buffer (in the order they were added to the 6612 * buffer), 6613 */ 6614 emitStrBuf(); 6615 /* 6616 * and reconsume the current input character in the RCDATA 6617 * state. 6618 */ 6619 break eofloop; 6620 case CLOSE_TAG_OPEN: 6621 /* EOF Parse error. */ 6622 errEofAfterLt(); 6623 /* 6624 * Emit a U+003C LESS-THAN SIGN character token and a U+002F 6625 * SOLIDUS character token. 6626 */ 6627 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); 6628 /* 6629 * Reconsume the EOF character in the data state. 
6630 */ 6631 break eofloop; 6632 case TAG_NAME: 6633 /* 6634 * EOF Parse error. 6635 */ 6636 errEofInTagName(); 6637 /* 6638 * Reconsume the EOF character in the data state. 6639 */ 6640 break eofloop; 6641 case BEFORE_ATTRIBUTE_NAME: 6642 case AFTER_ATTRIBUTE_VALUE_QUOTED: 6643 case SELF_CLOSING_START_TAG: 6644 /* EOF Parse error. */ 6645 errEofWithoutGt(); 6646 /* 6647 * Reconsume the EOF character in the data state. 6648 */ 6649 break eofloop; 6650 case ATTRIBUTE_NAME: 6651 /* 6652 * EOF Parse error. 6653 */ 6654 errEofInAttributeName(); 6655 /* 6656 * Reconsume the EOF character in the data state. 6657 */ 6658 break eofloop; 6659 case AFTER_ATTRIBUTE_NAME: 6660 case BEFORE_ATTRIBUTE_VALUE: 6661 /* EOF Parse error. */ 6662 errEofWithoutGt(); 6663 /* 6664 * Reconsume the EOF character in the data state. 6665 */ 6666 break eofloop; 6667 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 6668 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 6669 case ATTRIBUTE_VALUE_UNQUOTED: 6670 /* EOF Parse error. */ 6671 errEofInAttributeValue(); 6672 /* 6673 * Reconsume the EOF character in the data state. 6674 */ 6675 break eofloop; 6676 case BOGUS_COMMENT: 6677 emitComment(0, 0); 6678 break eofloop; 6679 case BOGUS_COMMENT_HYPHEN: 6680 // [NOCPP[ 6681 maybeAppendSpaceToBogusComment(); 6682 // ]NOCPP] 6683 emitComment(0, 0); 6684 break eofloop; 6685 case MARKUP_DECLARATION_OPEN: 6686 errBogusComment(); 6687 emitComment(0, 0); 6688 break eofloop; 6689 case MARKUP_DECLARATION_HYPHEN: 6690 errBogusComment(); 6691 emitComment(0, 0); 6692 break eofloop; 6693 case MARKUP_DECLARATION_OCTYPE: 6694 if (index < 6) { 6695 errBogusComment(); 6696 emitComment(0, 0); 6697 } else { 6698 /* EOF Parse error. */ 6699 errEofInDoctype(); 6700 /* 6701 * Create a new DOCTYPE token. Set its force-quirks flag 6702 * to on. 
6703 */ 6704 doctypeName = null; 6705 if (systemIdentifier != null) { 6706 Portability.releaseString(systemIdentifier); 6707 systemIdentifier = null; 6708 } 6709 if (publicIdentifier != null) { 6710 Portability.releaseString(publicIdentifier); 6711 publicIdentifier = null; 6712 } 6713 forceQuirks = true; 6714 /* 6715 * Emit the token. 6716 */ 6717 emitDoctypeToken(0); 6718 /* 6719 * Reconsume the EOF character in the data state. 6720 */ 6721 break eofloop; 6722 } 6723 break eofloop; 6724 case COMMENT_START: 6725 case COMMENT: 6726 case COMMENT_LESSTHAN: 6727 case COMMENT_LESSTHAN_BANG: 6728 /* 6729 * EOF Parse error. 6730 */ 6731 errEofInComment(); 6732 /* Emit the comment token. */ 6733 emitComment(0, 0); 6734 /* 6735 * Reconsume the EOF character in the data state. 6736 */ 6737 break eofloop; 6738 case COMMENT_END: 6739 case COMMENT_LESSTHAN_BANG_DASH_DASH: 6740 errEofInComment(); 6741 /* Emit the comment token. */ 6742 emitComment(2, 0); 6743 /* 6744 * Reconsume the EOF character in the data state. 6745 */ 6746 break eofloop; 6747 case COMMENT_END_DASH: 6748 case COMMENT_START_DASH: 6749 case COMMENT_LESSTHAN_BANG_DASH: 6750 errEofInComment(); 6751 /* Emit the comment token. */ 6752 emitComment(1, 0); 6753 /* 6754 * Reconsume the EOF character in the data state. 6755 */ 6756 break eofloop; 6757 case COMMENT_END_BANG: 6758 errEofInComment(); 6759 /* Emit the comment token. */ 6760 emitComment(3, 0); 6761 /* 6762 * Reconsume the EOF character in the data state. 6763 */ 6764 break eofloop; 6765 case DOCTYPE: 6766 case BEFORE_DOCTYPE_NAME: 6767 errEofInDoctype(); 6768 /* 6769 * Create a new DOCTYPE token. Set its force-quirks flag to 6770 * on. 6771 */ 6772 forceQuirks = true; 6773 /* 6774 * Emit the token. 6775 */ 6776 emitDoctypeToken(0); 6777 /* 6778 * Reconsume the EOF character in the data state. 6779 */ 6780 break eofloop; 6781 case DOCTYPE_NAME: 6782 errEofInDoctype(); 6783 strBufToDoctypeName(); 6784 /* 6785 * Set the DOCTYPE token's force-quirks flag to on. 
6786 */ 6787 forceQuirks = true; 6788 /* 6789 * Emit that DOCTYPE token. 6790 */ 6791 emitDoctypeToken(0); 6792 /* 6793 * Reconsume the EOF character in the data state. 6794 */ 6795 break eofloop; 6796 case DOCTYPE_UBLIC: 6797 case DOCTYPE_YSTEM: 6798 case AFTER_DOCTYPE_NAME: 6799 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 6800 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 6801 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 6802 errEofInDoctype(); 6803 /* 6804 * Set the DOCTYPE token's force-quirks flag to on. 6805 */ 6806 forceQuirks = true; 6807 /* 6808 * Emit that DOCTYPE token. 6809 */ 6810 emitDoctypeToken(0); 6811 /* 6812 * Reconsume the EOF character in the data state. 6813 */ 6814 break eofloop; 6815 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 6816 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 6817 /* EOF Parse error. */ 6818 errEofInPublicId(); 6819 /* 6820 * Set the DOCTYPE token's force-quirks flag to on. 6821 */ 6822 forceQuirks = true; 6823 /* 6824 * Emit that DOCTYPE token. 6825 */ 6826 publicIdentifier = strBufToString(); 6827 emitDoctypeToken(0); 6828 /* 6829 * Reconsume the EOF character in the data state. 6830 */ 6831 break eofloop; 6832 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 6833 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 6834 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 6835 errEofInDoctype(); 6836 /* 6837 * Set the DOCTYPE token's force-quirks flag to on. 6838 */ 6839 forceQuirks = true; 6840 /* 6841 * Emit that DOCTYPE token. 6842 */ 6843 emitDoctypeToken(0); 6844 /* 6845 * Reconsume the EOF character in the data state. 6846 */ 6847 break eofloop; 6848 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 6849 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 6850 /* EOF Parse error. */ 6851 errEofInSystemId(); 6852 /* 6853 * Set the DOCTYPE token's force-quirks flag to on. 6854 */ 6855 forceQuirks = true; 6856 /* 6857 * Emit that DOCTYPE token. 
6858 */ 6859 systemIdentifier = strBufToString(); 6860 emitDoctypeToken(0); 6861 /* 6862 * Reconsume the EOF character in the data state. 6863 */ 6864 break eofloop; 6865 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 6866 errEofInDoctype(); 6867 /* 6868 * Set the DOCTYPE token's force-quirks flag to on. 6869 */ 6870 forceQuirks = true; 6871 /* 6872 * Emit that DOCTYPE token. 6873 */ 6874 emitDoctypeToken(0); 6875 /* 6876 * Reconsume the EOF character in the data state. 6877 */ 6878 break eofloop; 6879 case BOGUS_DOCTYPE: 6880 /* 6881 * Emit that DOCTYPE token. 6882 */ 6883 emitDoctypeToken(0); 6884 /* 6885 * Reconsume the EOF character in the data state. 6886 */ 6887 break eofloop; 6888 case CONSUME_CHARACTER_REFERENCE: 6889 /* 6890 * Unlike the definition is the spec, this state does not 6891 * return a value and never requires the caller to 6892 * backtrack. This state takes care of emitting characters 6893 * or appending to the current attribute value. It also 6894 * takes care of that in the case when consuming the entity 6895 * fails. 6896 */ 6897 /* 6898 * This section defines how to consume an entity. This 6899 * definition is used when parsing entities in text and in 6900 * attributes. 6901 * 6902 * The behavior depends on the identity of the next 6903 * character (the one immediately after the U+0026 AMPERSAND 6904 * character): 6905 */ 6906 6907 emitOrAppendCharRefBuf(returnState); 6908 state = returnState; 6909 continue; 6910 case CHARACTER_REFERENCE_HILO_LOOKUP: 6911 emitOrAppendCharRefBuf(returnState); 6912 state = returnState; 6913 continue; 6914 case CHARACTER_REFERENCE_TAIL: 6915 outer: for (;;) { 6916 char c = '\u0000'; 6917 entCol++; 6918 /* 6919 * Consume the maximum number of characters possible, 6920 * with the consumed characters matching one of the 6921 * identifiers in the first column of the named 6922 * character references table (in a case-sensitive 6923 * manner). 
6924 */ 6925 hiloop: for (;;) { 6926 if (hi == -1) { 6927 break hiloop; 6928 } 6929 if (entCol == NamedCharacters.NAMES[hi].length()) { 6930 break hiloop; 6931 } 6932 if (entCol > NamedCharacters.NAMES[hi].length()) { 6933 break outer; 6934 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 6935 hi--; 6936 } else { 6937 break hiloop; 6938 } 6939 } 6940 6941 loloop: for (;;) { 6942 if (hi < lo) { 6943 break outer; 6944 } 6945 if (entCol == NamedCharacters.NAMES[lo].length()) { 6946 candidate = lo; 6947 charRefBufMark = charRefBufLen; 6948 lo++; 6949 } else if (entCol > NamedCharacters.NAMES[lo].length()) { 6950 break outer; 6951 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 6952 lo++; 6953 } else { 6954 break loloop; 6955 } 6956 } 6957 if (hi < lo) { 6958 break outer; 6959 } 6960 continue; 6961 } 6962 6963 if (candidate == -1) { 6964 emitOrAppendCharRefBuf(returnState); 6965 state = returnState; 6966 continue eofloop; 6967 } else { 6968 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 6969 if (candidateName.length() == 0 6970 || candidateName.charAt(candidateName.length() - 1) != ';') { 6971 /* 6972 * If the last character matched is not a U+003B 6973 * SEMICOLON (;), there is a parse error. 
6974 */ 6975 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 6976 /* 6977 * If the entity is being consumed as part of an 6978 * attribute, and the last character matched is 6979 * not a U+003B SEMICOLON (;), 6980 */ 6981 char ch; 6982 if (charRefBufMark == charRefBufLen) { 6983 ch = '\u0000'; 6984 } else { 6985 ch = charRefBuf[charRefBufMark]; 6986 } 6987 if ((ch >= '0' && ch <= '9') 6988 || (ch >= 'A' && ch <= 'Z') 6989 || (ch >= 'a' && ch <= 'z')) { 6990 /* 6991 * and the next character is in the range 6992 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 6993 * U+0041 LATIN CAPITAL LETTER A to U+005A 6994 * LATIN CAPITAL LETTER Z, or U+0061 LATIN 6995 * SMALL LETTER A to U+007A LATIN SMALL 6996 * LETTER Z, then, for historical reasons, 6997 * all the characters that were matched 6998 * after the U+0026 AMPERSAND (&) must be 6999 * unconsumed, and nothing is returned. 7000 */ 7001 appendCharRefBufToStrBuf(); 7002 state = returnState; 7003 continue eofloop; 7004 } 7005 } 7006 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 7007 errUnescapedAmpersandInterpretedAsCharacterReference(); 7008 } else { 7009 errNotSemicolonTerminated(); 7010 } 7011 } 7012 7013 /* 7014 * Otherwise, return a character token for the character 7015 * corresponding to the entity name (as given by the 7016 * second column of the named character references 7017 * table). 7018 */ 7019 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 7020 if ( 7021 // [NOCPP[ 7022 val.length == 1 7023 // ]NOCPP] 7024 // CPPONLY: val[1] == 0 7025 ) { 7026 emitOrAppendOne(val, returnState); 7027 } else { 7028 emitOrAppendTwo(val, returnState); 7029 } 7030 // this is so complicated! 
7031 if (charRefBufMark < charRefBufLen) { 7032 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 7033 appendStrBuf(charRefBuf, charRefBufMark, 7034 charRefBufLen - charRefBufMark); 7035 } else { 7036 tokenHandler.characters(charRefBuf, charRefBufMark, 7037 charRefBufLen - charRefBufMark); 7038 } 7039 } 7040 charRefBufLen = 0; 7041 state = returnState; 7042 continue eofloop; 7043 /* 7044 * If the markup contains I'm ¬it; I tell you, the 7045 * entity is parsed as "not", as in, I'm ¬it; I tell 7046 * you. But if the markup was I'm ∉ I tell you, 7047 * the entity would be parsed as "notin;", resulting in 7048 * I'm ∉ I tell you. 7049 */ 7050 } 7051 case CONSUME_NCR: 7052 case DECIMAL_NRC_LOOP: 7053 case HEX_NCR_LOOP: 7054 /* 7055 * If no characters match the range, then don't consume any 7056 * characters (and unconsume the U+0023 NUMBER SIGN 7057 * character and, if appropriate, the X character). This is 7058 * a parse error; nothing is returned. 7059 * 7060 * Otherwise, if the next character is a U+003B SEMICOLON, 7061 * consume that too. If it isn't, there is a parse error. 7062 */ 7063 if (!seenDigits) { 7064 errNoDigitsInNCR(); 7065 emitOrAppendCharRefBuf(returnState); 7066 state = returnState; 7067 continue; 7068 } else { 7069 errCharRefLacksSemicolon(); 7070 } 7071 // WARNING previous state sets reconsume 7072 handleNcrValue(returnState); 7073 state = returnState; 7074 continue; 7075 case CDATA_RSQB: 7076 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 7077 break eofloop; 7078 case CDATA_RSQB_RSQB: 7079 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 7080 break eofloop; 7081 case DATA: 7082 default: 7083 break eofloop; 7084 } 7085 } 7086 // case DATA: 7087 /* 7088 * EOF Emit an end-of-file token. 7089 */ 7090 tokenHandler.eof(); 7091 return; 7092 } 7093 7094 /** 7095 * Emits a doctype token. 
7096 * 7097 * NOTE: The method may set <code>shouldSuspend</code>, so the caller 7098 * must have this pattern after the state's <code>transition</code> call: 7099 * <pre> 7100 * if (shouldSuspend) { 7101 * break stateloop; 7102 * } 7103 * continue stateloop; 7104 * </pre> 7105 * 7106 * @param pos 7107 * @throws SAXException 7108 */ 7109 private void emitDoctypeToken(int pos) throws SAXException { 7110 // CPPONLY: RememberGt(pos); 7111 cstart = pos + 1; 7112 tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier, 7113 forceQuirks); 7114 // It is OK and sufficient to release these here, since 7115 // there's no way out of the doctype states than through paths 7116 // that call this method. 7117 doctypeName = null; 7118 Portability.releaseString(publicIdentifier); 7119 publicIdentifier = null; 7120 Portability.releaseString(systemIdentifier); 7121 systemIdentifier = null; 7122 suspendIfRequestedAfterCurrentNonTextToken(); 7123 } 7124 7125 /** 7126 * If a previous call to <code>suspendAfterCurrentTokenIfNotInText()</code> 7127 * happened in a non-text context, this method turns that deferred suspension 7128 * request into an immediately-pending suspension request. 7129 */ 7130 @Inline private void suspendIfRequestedAfterCurrentNonTextToken() { 7131 if (suspendAfterCurrentNonTextToken) { 7132 suspendAfterCurrentNonTextToken = false; 7133 shouldSuspend = true; 7134 } 7135 } 7136 7137 // Making this private until the full Java implementation is done. 7138 /** 7139 * Request suspension after the current token if the tokenizer is currently 7140 * in a non-text state (i.e. it's known that the next token will be a 7141 * non-text token). 7142 * 7143 * Must not be called when <code>tokenizeBuffer()</code> is on the call 7144 * stack. 
7145 */ 7146 @SuppressWarnings("unused") private void suspendAfterCurrentTokenIfNotInText() { 7147 switch (stateSave) { 7148 case DATA: 7149 case RCDATA: 7150 case SCRIPT_DATA: 7151 case RAWTEXT: 7152 case SCRIPT_DATA_ESCAPED: 7153 case PLAINTEXT: 7154 case NON_DATA_END_TAG_NAME: // We haven't yet committed to the next 7155 // token being a non-text token, though 7156 // it could be. 7157 case SCRIPT_DATA_LESS_THAN_SIGN: 7158 case SCRIPT_DATA_ESCAPE_START: 7159 case SCRIPT_DATA_ESCAPE_START_DASH: 7160 case SCRIPT_DATA_ESCAPED_DASH: 7161 case SCRIPT_DATA_ESCAPED_DASH_DASH: 7162 case RAWTEXT_RCDATA_LESS_THAN_SIGN: 7163 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 7164 case SCRIPT_DATA_DOUBLE_ESCAPE_START: 7165 case SCRIPT_DATA_DOUBLE_ESCAPED: 7166 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 7167 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 7168 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 7169 case SCRIPT_DATA_DOUBLE_ESCAPE_END: 7170 return; 7171 case TAG_NAME: 7172 case BEFORE_ATTRIBUTE_NAME: 7173 case ATTRIBUTE_NAME: 7174 case AFTER_ATTRIBUTE_NAME: 7175 case BEFORE_ATTRIBUTE_VALUE: 7176 case AFTER_ATTRIBUTE_VALUE_QUOTED: 7177 case BOGUS_COMMENT: 7178 case MARKUP_DECLARATION_OPEN: 7179 case DOCTYPE: 7180 case BEFORE_DOCTYPE_NAME: 7181 case DOCTYPE_NAME: 7182 case AFTER_DOCTYPE_NAME: 7183 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 7184 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 7185 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 7186 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 7187 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 7188 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 7189 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 7190 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 7191 case BOGUS_DOCTYPE: 7192 case COMMENT_START: 7193 case COMMENT_START_DASH: 7194 case COMMENT: 7195 case COMMENT_END_DASH: 7196 case COMMENT_END: 7197 case COMMENT_END_BANG: 7198 case TAG_OPEN: 7199 case CLOSE_TAG_OPEN: 7200 case MARKUP_DECLARATION_HYPHEN: 7201 case MARKUP_DECLARATION_OCTYPE: 7202 case DOCTYPE_UBLIC: 7203 
case DOCTYPE_YSTEM: 7204 case AFTER_DOCTYPE_PUBLIC_KEYWORD: 7205 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 7206 case AFTER_DOCTYPE_SYSTEM_KEYWORD: 7207 case SELF_CLOSING_START_TAG: 7208 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 7209 case ATTRIBUTE_VALUE_SINGLE_QUOTED: 7210 case ATTRIBUTE_VALUE_UNQUOTED: 7211 case BOGUS_COMMENT_HYPHEN: 7212 case COMMENT_LESSTHAN: 7213 case COMMENT_LESSTHAN_BANG: 7214 case COMMENT_LESSTHAN_BANG_DASH: 7215 case COMMENT_LESSTHAN_BANG_DASH_DASH: 7216 case CDATA_START: 7217 case CDATA_SECTION: 7218 case CDATA_RSQB: 7219 case CDATA_RSQB_RSQB: 7220 case PROCESSING_INSTRUCTION: 7221 case PROCESSING_INSTRUCTION_QUESTION_MARK: 7222 break; 7223 case CONSUME_CHARACTER_REFERENCE: 7224 case CONSUME_NCR: 7225 case CHARACTER_REFERENCE_TAIL: 7226 case HEX_NCR_LOOP: 7227 case DECIMAL_NRC_LOOP: 7228 case HANDLE_NCR_VALUE: 7229 case HANDLE_NCR_VALUE_RECONSUME: 7230 case CHARACTER_REFERENCE_HILO_LOOKUP: 7231 if (returnStateSave == DATA || returnStateSave == RCDATA) { 7232 return; 7233 } 7234 break; 7235 default: 7236 assert false : "Incomplete switch"; 7237 return; 7238 } 7239 suspendAfterCurrentNonTextToken = true; 7240 } 7241 7242 // Making this private until the full Java implementation is done. 7243 /** 7244 * Queries if we are about to suspend after the current non-text token due to a request 7245 * from <code>suspendAfterCurrentTokenIfNotInText()</code>. 7246 * @return <code>true</code> iff <code>suspendAfterCurrentTokenIfNotInText()</code> was 7247 * called in a non-text position and the then-current token has not been emitted yet. 
*/
@SuppressWarnings("unused") private boolean suspensionAfterCurrentNonTextTokenPending() {
    return suspendAfterCurrentNonTextToken;
}

// [NOCPP[

/**
 * Reads one character from the buffer. This implementation simply
 * returns <code>buf[pos]</code>.
 *
 * @param buf
 *            the buffer to read from
 * @param pos
 *            index of the character to read
 * @return the character at <code>pos</code>
 * @throws SAXException
 *             never thrown by this implementation
 */
@Inline protected char checkChar(@NoLength char[] buf, int pos)
        throws SAXException {
    return buf[pos];
}

// ]NOCPP]

/**
 * Forwards an internal encoding declaration to the registered
 * <code>encodingDeclarationHandler</code>, if there is one.
 *
 * @param internalCharset
 *            the declared charset name
 * @return the handler's result, or <code>false</code> when no handler is
 *         set
 * @throws SAXException
 *             propagated from the handler
 */
public boolean internalEncodingDeclaration(String internalCharset)
        throws SAXException {
    if (encodingDeclarationHandler == null) {
        return false;
    }
    return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
}

/**
 * Outputs the first two chars of <code>val</code>. When
 * <code>returnState</code> has any bit of
 * <code>DATA_AND_RCDATA_MASK</code> set, the chars are appended to the
 * string buffer; otherwise they are passed to
 * <code>tokenHandler.characters()</code>.
 *
 * @param val
 *            array holding at least two chars
 * @param returnState
 *            the state that decides where the chars go
 * @throws SAXException
 *             propagated from the token handler
 */
@Inline private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
        throws SAXException {
    if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
        tokenHandler.characters(val, 0, 2);
        return;
    }
    appendStrBuf(val[0]);
    appendStrBuf(val[1]);
}

/**
 * Outputs the first char of <code>val</code>. When
 * <code>returnState</code> has any bit of
 * <code>DATA_AND_RCDATA_MASK</code> set, the char is appended to the
 * string buffer; otherwise it is passed to
 * <code>tokenHandler.characters()</code>.
 *
 * @param val
 *            array holding at least one char
 * @param returnState
 *            the state that decides where the char goes
 * @throws SAXException
 *             propagated from the token handler
 */
@Inline private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
        throws SAXException {
    if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
        tokenHandler.characters(val, 0, 1);
        return;
    }
    appendStrBuf(val[0]);
}

/**
 * Ends tokenization: drops per-document token state, notifies the token
 * handler through <code>endTokenization()</code>, and then discards the
 * attribute holder. The string buffer is kept only when
 * <code>keepBuffer</code> is set.
 *
 * @throws SAXException
 *             propagated from the token handler
 */
public void end() throws SAXException {
    tagName = null;
    doctypeName = null;
    if (publicIdentifier != null) {
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
    }
    if (systemIdentifier != null) {
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }
    if (!keepBuffer) {
        strBuf = null;
    }
    nonInternedTagName.setNameForNonInterned(null
    // CPPONLY: , false
    );
    attributeName = null;
    // CPPONLY: nonInternedAttributeName.setNameForNonInterned(null);
    tokenHandler.endTokenization();
    if (attributes != null) {
        // [NOCPP[
        attributes = null;
        // ]NOCPP]
        // CPPONLY: attributes.clear(mappingLangToXmlLang);
    }
}
/**
 * Requests suspension by setting <code>shouldSuspend</code>.
 */
@Inline public void requestSuspension() {
    shouldSuspend = true;
}

// [NOCPP[

/**
 * Marks this tokenizer as confident by setting <code>confident</code>.
 */
public void becomeConfident() {
    confident = true;
}

/**
 * Reports whether the next character is on a new line.
 *
 * This implementation does not track that and always returns
 * <code>false</code>.
 *
 * @return <code>false</code>
 */
public boolean isNextCharOnNewLine() {
    return false;
}

/**
 * Returns the <code>lastCR</code> flag.
 *
 * @return the current value of <code>lastCR</code>
 */
public boolean isPrevCR() {
    return lastCR;
}

/**
 * Returns the line.
 *
 * This implementation does not expose a line number and always returns
 * <code>-1</code>.
 *
 * @return <code>-1</code>
 */
public int getLine() {
    return -1;
}

/**
 * Returns the col.
 *
 * This implementation does not expose a column number and always returns
 * <code>-1</code>.
 *
 * @return <code>-1</code>
 */
public int getCol() {
    return -1;
}

// ]NOCPP]

/**
 * Tells whether the saved tokenizer state is the data state.
 *
 * @return <code>true</code> iff <code>stateSave</code> is
 *         <code>DATA</code>
 */
@Inline public boolean isInDataState() {
    return (stateSave == DATA);
}

/**
 * Resets the tokenizer to the data state and clears the per-token and
 * character-reference bookkeeping fields. The attribute holder is deleted
 * only when <code>newAttributesEachTime</code> is set.
 */
public void resetToDataState() {
    clearStrBufAfterUse();
    charRefBufLen = 0;
    stateSave = Tokenizer.DATA;
    // line = 1; XXX line numbers
    lastCR = false;
    index = 0;
    forceQuirks = false;
    additional = '\u0000';
    entCol = -1;
    firstCharKey = -1;
    lo = 0;
    hi = 0; // will always be overwritten before use anyway
    candidate = -1;
    charRefBufMark = 0;
    value = 0;
    seenDigits = false;
    suspendAfterCurrentNonTextToken = false;
    endTag = false;
    shouldSuspend = false;
    initDoctypeFields();
    containsHyphen = false;
    tagName = null;
    attributeName = null;
    if (newAttributesEachTime) {
        if (attributes != null) {
            Portability.delete(attributes);
            attributes = null;
        }
    }
}

/**
 * Copies the tokenization state of <code>other</code> into this
 * tokenizer.
 *
 * Buffers are copied by value, the public and system identifiers are
 * duplicated through <code>Portability.newStringFromString()</code>, and
 * the attribute holder is cloned. <code>shouldSuspend</code> and
 * <code>suspendAfterCurrentNonTextToken</code> are not copied; both are
 * reset to <code>false</code>.
 *
 * @param other
 *            the tokenizer to copy state from
 * @throws SAXException
 *             declared for callers; presumably propagated from helpers
 *             called here
 */
public void loadState(Tokenizer other) throws SAXException {
    strBufLen = other.strBufLen;
    // strBuf may legitimately be null on either side: end() and
    // initializeWithoutStarting() drop it when keepBuffer is false.
    // Touch the arrays only when there is something to copy, so that an
    // empty buffer never causes a NullPointerException.
    if (strBufLen > 0) {
        if (strBuf == null || strBufLen > strBuf.length) {
            strBuf = new char[strBufLen];
        }
        System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
    }

    charRefBufLen = other.charRefBufLen;
    System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);

    stateSave = other.stateSave;
    returnStateSave = other.returnStateSave;
    endTagExpectation = other.endTagExpectation;
    endTagExpectationAsArray = other.endTagExpectationAsArray;
    // line = 1; XXX line numbers
    lastCR = other.lastCR;
    index = other.index;
    forceQuirks = other.forceQuirks;
    additional = other.additional;
    entCol = other.entCol;
    firstCharKey = other.firstCharKey;
    lo = other.lo;
    hi = other.hi;
    candidate = other.candidate;
    charRefBufMark = other.charRefBufMark;
    value = other.value;
    seenDigits = other.seenDigits;
    endTag = other.endTag;
    shouldSuspend = false;
    suspendAfterCurrentNonTextToken = false;
    doctypeName = other.doctypeName;

    Portability.releaseString(systemIdentifier);
    if (other.systemIdentifier == null) {
        systemIdentifier = null;
    } else {
        systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
    }

    Portability.releaseString(publicIdentifier);
    if (other.publicIdentifier == null) {
        publicIdentifier = null;
    } else {
        publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
    }

    containsHyphen = other.containsHyphen;
    if (other.tagName == null) {
        tagName = null;
    } else if (other.tagName.isInterned()) {
        tagName = other.tagName;
    } else {
        // In the C++ case, the atoms in the other tokenizer are from a
        // different tokenizer-scoped atom table. Therefore, we have to
        // obtain the corresponding atom from our own atom table.
        nonInternedTagName.setNameForNonInterned(other.tagName.getName()
        // CPPONLY: , other.tagName.isCustom()
        );
        tagName = nonInternedTagName;
    }

    // [NOCPP[
    attributeName = other.attributeName;
    // ]NOCPP]
    // CPPONLY: if (other.attributeName == null) {
    // CPPONLY:     attributeName = null;
    // CPPONLY: } else if (other.attributeName.isInterned()) {
    // CPPONLY:     attributeName = other.attributeName;
    // CPPONLY: } else {
    // CPPONLY:     // In the C++ case, the atoms in the other tokenizer are from a
    // CPPONLY:     // different tokenizer-scoped atom table. Therefore, we have to
    // CPPONLY:     // obtain the correspoding atom from our own atom table.
    // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(other.attributeName.getLocal(AttributeName.HTML));
    // CPPONLY:     attributeName = nonInternedAttributeName;
    // CPPONLY: }

    Portability.delete(attributes);
    if (other.attributes == null) {
        attributes = null;
    } else {
        attributes = other.attributes.cloneAttributes();
    }
}

/**
 * Prepares the tokenizer for a new run without notifying the token
 * handler that tokenization has started: clears <code>confident</code>,
 * drops the string buffer unless <code>keepBuffer</code> is set, resets
 * the line counter to 1 and finally calls
 * <code>resetToDataState()</code>.
 *
 * @throws SAXException
 *             propagated from the token handler
 */
public void initializeWithoutStarting() throws SAXException {
    confident = false;
    if (!keepBuffer) {
        strBuf = null;
    }
    line = 1;
    // CPPONLY: attributeLine = 1;
    // [NOCPP[
    metaBoundaryPassed = false;
    wantsComments = tokenHandler.wantsComments();
    if (!newAttributesEachTime) {
        attributes = new HtmlAttributes(mappingLangToXmlLang);
    }
    // ]NOCPP]
    resetToDataState();
}

// Error, warning and note hooks.
//
// Every method below is a no-op in this class (the two char-returning
// hooks hand their argument back unchanged). They are protected,
// presumably so that an error-reporting subclass can override them.

protected void errGarbageAfterLtSlash() throws SAXException {
}

protected void errLtSlashGt() throws SAXException {
}

protected void errWarnLtSlashInRcdata() throws SAXException {
}

protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
}

protected void errCharRefLacksSemicolon() throws SAXException {
}

protected void errNoDigitsInNCR() throws SAXException {
}

protected void errGtInSystemId() throws SAXException {
}

protected void errGtInPublicId() throws SAXException {
}

protected void errNamelessDoctype() throws SAXException {
}

protected void errNestedComment() throws SAXException {
}

protected void errPrematureEndOfComment() throws SAXException {
}

protected void errBogusComment() throws SAXException {
}

protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
}

protected void errSlashNotFollowedByGt() throws SAXException {
}

protected void errNoSpaceBetweenAttributes() throws SAXException {
}

protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
        throws SAXException {
}

protected void errAttributeValueMissing() throws SAXException {
}

protected void errBadCharBeforeAttributeNameOrNull(char c)
        throws SAXException {
}

protected void errEqualsSignBeforeAttributeName() throws SAXException {
}

protected void errBadCharAfterLt(char c) throws SAXException {
}

protected void errLtGt() throws SAXException {
}

protected void errProcessingInstruction() throws SAXException {
}

protected void errUnescapedAmpersandInterpretedAsCharacterReference()
        throws SAXException {
}

protected void errNotSemicolonTerminated() throws SAXException {
}

protected void errNoNamedCharacterMatch() throws SAXException {
}

protected void errQuoteBeforeAttributeName(char c) throws SAXException {
}

protected void errQuoteOrLtInAttributeNameOrNull(char c)
        throws SAXException {
}

protected void errExpectedPublicId() throws SAXException {
}

protected void errBogusDoctype() throws SAXException {
}

protected void maybeWarnPrivateUseAstral() throws SAXException {
}

protected void maybeWarnPrivateUse(char ch) throws SAXException {
}

protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
        throws SAXException {
}

protected void maybeErrSlashInEndTag(boolean selfClosing)
        throws SAXException {
}

protected char errNcrNonCharacter(char ch) throws SAXException {
    return ch;
}

protected void errAstralNonCharacter(int ch) throws SAXException {
}

protected void errNcrSurrogate() throws SAXException {
}

protected char errNcrControlChar(char ch) throws SAXException {
    return ch;
}

protected void errNcrCr() throws SAXException {
}

protected void errNcrInC1Range() throws SAXException {
}

protected void errEofInPublicId() throws SAXException {
}

protected void errEofInComment() throws SAXException {
}

protected void errEofInDoctype() throws SAXException {
}

protected void errEofInAttributeValue() throws SAXException {
}

protected void errEofInAttributeName() throws SAXException {
}

protected void errEofWithoutGt() throws SAXException {
}

protected void errEofInTagName() throws SAXException {
}

protected void errEofInEndTag() throws SAXException {
}

protected void errEofAfterLt() throws SAXException {
}

protected void errNcrOutOfRange() throws SAXException {
}

protected void errNcrUnassigned() throws SAXException {
}

protected void errDuplicateAttribute() throws SAXException {
}

protected void errEofInSystemId() throws SAXException {
}

protected void errExpectedSystemId() throws SAXException {
}

protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
}

protected void errNcrControlChar() throws SAXException {
}

protected void errNcrZero() throws SAXException {
}

protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
        throws SAXException {
}

protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
}

protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
        throws SAXException {
}

protected void noteAttributeWithoutValue() throws SAXException {
}

protected void noteUnquotedAttributeValue() throws SAXException {
}

/**
 * Sets the encodingDeclarationHandler.
 *
 * @param encodingDeclarationHandler
 *            the encodingDeclarationHandler to set
 */
public void setEncodingDeclarationHandler(
        EncodingDeclarationHandler encodingDeclarationHandler) {
    this.encodingDeclarationHandler = encodingDeclarationHandler;
}

/**
 * Releases the objects owned by this tokenizer: the non-interned tag name
 * holder and the attribute holder are deleted through
 * <code>Portability.delete()</code> and their fields set to
 * <code>null</code>.
 */
void destructor() {
    Portability.delete(nonInternedTagName);
    nonInternedTagName = null;
    // CPPONLY: Portability.delete(nonInternedAttributeName);
    // CPPONLY: nonInternedAttributeName = null;
    // The translator will write refcount tracing stuff here
    Portability.delete(attributes);
    attributes = null;
}

// [NOCPP[

/**
 * Sets an offset to be added to the position reported to
 * <code>TransitionHandler</code>.
 *
 * This implementation ignores the offset; the method body is empty.
 *
 * @param offset the offset
 */
public void setTransitionBaseOffset(int offset) {

}

// ]NOCPP]

}