tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Tokenizer.java (351838B)


      1 /*
      2 * Copyright (c) 2005-2007 Henri Sivonen
      3 * Copyright (c) 2007-2017 Mozilla Foundation
      4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
      5 * Foundation, and Opera Software ASA.
      6 *
      7 * Permission is hereby granted, free of charge, to any person obtaining a
      8 * copy of this software and associated documentation files (the "Software"),
      9 * to deal in the Software without restriction, including without limitation
     10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     11 * and/or sell copies of the Software, and to permit persons to whom the
     12 * Software is furnished to do so, subject to the following conditions:
     13 *
     14 * The above copyright notice and this permission notice shall be included in
     15 * all copies or substantial portions of the Software.
     16 *
     17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     23 * DEALINGS IN THE SOFTWARE.
     24 */
     25 
     26 /*
     27 * The comments following this one that use the same comment syntax as this
     28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
     29 * amended as of June 18 2008 and May 31 2010.
     30 * That document came with this statement:
     31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
     32 * Opera Software ASA. You are granted a license to use, reproduce and
     33 * create derivative works of this document."
     34 */
     35 
     36 package nu.validator.htmlparser.impl;
     37 
     38 import java.util.HashMap;
     39 
     40 import org.xml.sax.ErrorHandler;
     41 import org.xml.sax.Locator;
     42 import org.xml.sax.ext.Locator2;
     43 import org.xml.sax.SAXException;
     44 import org.xml.sax.SAXParseException;
     45 
     46 import nu.validator.htmlparser.annotation.Auto;
     47 import nu.validator.htmlparser.annotation.CharacterName;
     48 import nu.validator.htmlparser.annotation.Const;
     49 import nu.validator.htmlparser.annotation.Inline;
     50 import nu.validator.htmlparser.annotation.Local;
     51 import nu.validator.htmlparser.annotation.NoLength;
     52 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
     53 import nu.validator.htmlparser.common.Interner;
     54 import nu.validator.htmlparser.common.TokenHandler;
     55 import nu.validator.htmlparser.common.XmlViolationPolicy;
     56 
     57 /**
     58 * An implementation of
     59 * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
     60 *
     61 * This class implements the <code>Locator</code> interface. This is not an
     62 * incidental implementation detail: Users of this class are encouraged to make
     63 * use of the <code>Locator</code> nature.
     64 *
     65 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
     66 * can be configured to treat these conditions as fatal or to coerce the infoset
     67 * to something that XML 1.0 allows.
     68 *
     69 * @version $Id$
     70 * @author hsivonen
     71 */
     72 public class Tokenizer implements Locator, Locator2 {
     73 
    /**
     * Bit mask that clears the lowest bit of a state value. Of the state
     * constants declared below, only <code>DATA</code> (0) and
     * <code>RCDATA</code> (1) become zero when ANDed with this mask.
     */
    private static final int DATA_AND_RCDATA_MASK = ~1;

    // Tokenizer state identifiers. The values run consecutively from 0 to 74
    // and resume at 76; no constant in this list has the value 75.

    public static final int DATA = 0;

    public static final int RCDATA = 1;

    public static final int SCRIPT_DATA = 2;

    public static final int RAWTEXT = 3;

    public static final int SCRIPT_DATA_ESCAPED = 4;

    public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;

    public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;

    public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;

    public static final int PLAINTEXT = 8;

    public static final int TAG_OPEN = 9;

    public static final int CLOSE_TAG_OPEN = 10;

    public static final int TAG_NAME = 11;

    public static final int BEFORE_ATTRIBUTE_NAME = 12;

    public static final int ATTRIBUTE_NAME = 13;

    public static final int AFTER_ATTRIBUTE_NAME = 14;

    public static final int BEFORE_ATTRIBUTE_VALUE = 15;

    public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;

    public static final int BOGUS_COMMENT = 17;

    public static final int MARKUP_DECLARATION_OPEN = 18;

    public static final int DOCTYPE = 19;

    public static final int BEFORE_DOCTYPE_NAME = 20;

    public static final int DOCTYPE_NAME = 21;

    public static final int AFTER_DOCTYPE_NAME = 22;

    public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;

    public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;

    public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;

    public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;

    public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;

    public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;

    public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;

    public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;

    public static final int BOGUS_DOCTYPE = 31;

    public static final int COMMENT_START = 32;

    public static final int COMMENT_START_DASH = 33;

    public static final int COMMENT = 34;

    public static final int COMMENT_END_DASH = 35;

    public static final int COMMENT_END = 36;

    public static final int COMMENT_END_BANG = 37;

    public static final int NON_DATA_END_TAG_NAME = 38;

    public static final int MARKUP_DECLARATION_HYPHEN = 39;

    // The three names below end in OCTYPE / UBLIC / YSTEM, the same spellings
    // as the OCTYPE, UBLIC and YSTEM char arrays declared in this class.

    public static final int MARKUP_DECLARATION_OCTYPE = 40;

    public static final int DOCTYPE_UBLIC = 41;

    public static final int DOCTYPE_YSTEM = 42;

    public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;

    public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;

    public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;

    public static final int CONSUME_CHARACTER_REFERENCE = 46;

    public static final int CONSUME_NCR = 47;

    public static final int CHARACTER_REFERENCE_TAIL = 48;

    public static final int HEX_NCR_LOOP = 49;

    // NOTE: this public name is spelled "NRC", unlike CONSUME_NCR and
    // HEX_NCR_LOOP; it cannot be renamed without breaking callers.
    public static final int DECIMAL_NRC_LOOP = 50;

    public static final int HANDLE_NCR_VALUE = 51;

    public static final int HANDLE_NCR_VALUE_RECONSUME = 52;

    public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;

    public static final int SELF_CLOSING_START_TAG = 54;

    public static final int CDATA_START = 55;

    public static final int CDATA_SECTION = 56;

    public static final int CDATA_RSQB = 57;

    public static final int CDATA_RSQB_RSQB = 58;

    public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;

    public static final int SCRIPT_DATA_ESCAPE_START = 60;

    public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;

    public static final int SCRIPT_DATA_ESCAPED_DASH = 62;

    public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;

    public static final int BOGUS_COMMENT_HYPHEN = 64;

    public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;

    public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;

    public static final int PROCESSING_INSTRUCTION = 73;

    public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;

    // 75 is intentionally or historically unassigned in this list.
    public static final int COMMENT_LESSTHAN = 76;

    public static final int COMMENT_LESSTHAN_BANG = 77;

    public static final int COMMENT_LESSTHAN_BANG_DASH = 78;

    public static final int COMMENT_LESSTHAN_BANG_DASH_DASH = 79;
    233 
    /**
     * Magic value for UTF-16 operations. Evaluates to 0xD7C0
     * (0xD800 - 0x40), so that <code>LEAD_OFFSET + (c >> 10)</code> is the
     * lead surrogate of a supplementary code point <code>c</code>.
     * NOTE(review): the use site is outside this excerpt; confirm it applies
     * the value this way.
     */
    private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));

    /**
     * UTF-16 code unit array containing less than and greater than for emitting
     * those characters on certain parse errors.
     */
    private static final @NoLength char[] LT_GT = { '<', '>' };

    /**
     * UTF-16 code unit array containing less than and solidus for emitting
     * those characters on certain parse errors.
     */
    private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };

    /**
     * UTF-16 code unit array containing ]] for emitting those characters on
     * state transitions.
     */
    private static final @NoLength char[] RSQB_RSQB = { ']', ']' };

    /**
     * Array version of U+FFFD.
     */
    private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };

    // [NOCPP[

    /**
     * Array version of space.
     */
    private static final @NoLength char[] SPACE = { ' ' };

    // ]NOCPP]

    /**
     * Array version of line feed.
     */
    private static final @NoLength char[] LF = { '\n' };

    /**
     * "CDATA[" as <code>char[]</code>
     */
    private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
            'A', '[' };

    /**
     * "octype" as <code>char[]</code>
     */
    private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
            'e' };

    /**
     * "ublic" as <code>char[]</code>
     */
    private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };

    /**
     * "ystem" as <code>char[]</code>
     */
    private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };

    // Lower-case element names as char[]. endTagExpectationToArray() assigns
    // one of these to endTagExpectationAsArray according to the TreeBuilder
    // group of the expected end tag. Unlike the arrays above, these are not
    // annotated with @NoLength.

    private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };

    private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };

    private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };

    private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
            'e', 'x', 't' };

    private static final char[] XMP_ARR = { 'x', 'm', 'p' };

    private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
            'e', 'a' };

    private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };

    private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
            'd' };

    private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
            'p', 't' };

    private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
            'e', 's' };
    322 
    /**
     * The token handler. Assigned once, in the constructors.
     */
    protected final TokenHandler tokenHandler;

    /**
     * Source of the value returned by <code>getEncoding()</code>. The
     * constructors initialize it to <code>null</code>, in which case
     * <code>getEncoding()</code> returns <code>null</code>.
     */
    protected EncodingDeclarationHandler encodingDeclarationHandler;

    // [NOCPP[

    /**
     * The error handler. Set through <code>setErrorHandler</code>.
     */
    protected ErrorHandler errorHandler;

    // ]NOCPP]

    /**
     * Whether the previous char read was CR.
     */
    protected boolean lastCR;

    /**
     * The saved tokenizer state. Written by <code>setState</code> and both
     * <code>setStateAndEndTagExpectation</code> overloads; the constructors
     * initialize it to 0, which is the value of <code>DATA</code>.
     */
    protected int stateSave;

    // NOTE(review): the fields from here to cstart are all reset to zero/false
    // by the constructors; their use lies outside this excerpt, so their
    // meaning is not documented here.

    private int returnStateSave;

    protected int index;

    private boolean forceQuirks;

    private char additional;

    private int entCol;

    private int firstCharKey;

    private int lo;

    private int hi;

    private int candidate;

    private int charRefBufMark;

    protected int value;

    private boolean seenDigits;

    private boolean suspendAfterCurrentNonTextToken;

    protected int cstart;
    373 
    /**
     * The SAX public id for the resource being tokenized. (Only passed back
     * as part of locator data.) Set by <code>initLocation</code>.
     */
    private String publicId;

    /**
     * The SAX system id for the resource being tokenized. (Only passed back
     * as part of locator data.) Set by <code>initLocation</code>.
     */
    private String systemId;

    /**
     * Buffer for bufferable things other than those that fit the description
     * of <code>charRefBuf</code>. May be <code>null</code>: the constructors
     * do not allocate it and <code>dropBufferIfLongerThan</code> resets it to
     * <code>null</code>.
     */
    private @Auto char[] strBuf;

    /**
     * Number of significant <code>char</code>s in <code>strBuf</code>.
     */
    private int strBufLen;

    /**
     * Buffer for characters that might form a character reference but may
     * end up not forming one. Allocated with 32 elements in the constructors.
     */
    private final @Auto char[] charRefBuf;

    /**
     * Number of significant <code>char</code>s in <code>charRefBuf</code>.
     */
    private int charRefBufLen;

    /**
     * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
     * Holds one <code>char</code>.
     */
    private final @Auto char[] bmpChar;

    /**
     * Buffer for expanding astral NCRs. Holds two <code>char</code>s.
     */
    private final @Auto char[] astralChar;

    /**
     * The element whose end tag closes the current CDATA or RCDATA element.
     */
    protected ElementName endTagExpectation = null;

    /**
     * The name of <code>endTagExpectation</code> as a <code>char[]</code>.
     * <code>endTagExpectationToArray</code> points it at one of the shared
     * static <code>*_ARR</code> constants; <code>setState</code> resets it to
     * <code>null</code>.
     */
    private char[] endTagExpectationAsArray; // not @Auto!
    424 
    /**
     * <code>true</code> if tokenizing an end tag
     */
    protected boolean endTag;

    /**
     * <code>true</code> iff the current element/attribute name contains
     * a hyphen.
     */
    private boolean containsHyphen;

    /**
     * The current tag token name. One of
     * 1) null,
     * 2) non-owning reference to nonInternedTagName
     * 3) non-owning reference to a pre-interned ElementName
     */
    private ElementName tagName = null;

    /**
     * The recycled ElementName instance for the non-pre-interned cases.
     * Declared as <code>null</code> here but replaced with a fresh
     * <code>ElementName</code> by both constructors.
     */
    private ElementName nonInternedTagName = null;

    /**
     * The current attribute name.
     */
    protected AttributeName attributeName = null;

    // CPPONLY: private AttributeName nonInternedAttributeName = null;

    // [NOCPP[

    /**
     * Whether comment tokens are emitted.
     */
    private boolean wantsComments = false;

    /**
     * Whether the stream is past the first 1024 bytes. Set to
     * <code>true</code> by <code>notifyAboutMetaBoundary</code>.
     */
    private boolean metaBoundaryPassed;

    // ]NOCPP]

    /**
     * The name of the current doctype token.
     */
    private @Local String doctypeName;

    /**
     * The public id of the current doctype token.
     */
    private String publicIdentifier;

    /**
     * The system id of the current doctype token.
     */
    private String systemIdentifier;

    /**
     * The attribute holder.
     */
    private HtmlAttributes attributes;

    // [NOCPP[

    /**
     * The policy for vertical tab and form feed.
     */
    private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * The policy for comments.
     */
    private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * The xmlns policy. <code>setXmlnsPolicy</code> refuses
     * <code>XmlViolationPolicy.FATAL</code>.
     */
    private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * The name policy. Set through <code>setNamePolicy</code>.
     */
    private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * Either <code>AttributeName.HTML_LANG</code> or
     * <code>AttributeName.HTML</code> once
     * <code>setMappingLangToXmlLang</code> has been called; not assigned by
     * the constructors.
     */
    private int mappingLangToXmlLang;

    // ]NOCPP]

    /**
     * When <code>true</code>, <code>emptyAttributes()</code> creates a new
     * <code>HtmlAttributes</code> instead of reusing one. Fixed at
     * construction time.
     */
    private final boolean newAttributesEachTime;

    private boolean shouldSuspend;

    /**
     * Set through <code>setKeepBuffer</code>.
     */
    private boolean keepBuffer;

    protected boolean confident;

    /**
     * The current line number, as returned by <code>getLineNumber()</code>
     * and overwritten by <code>setLineNumber</code>.
     */
    private int line;

    /*
     * The line number of the current attribute. First set to the line of the
     * attribute name and if there is a value, set to the line the value
     * started on.
     */
    // CPPONLY: private int attributeLine;

    /**
     * Set through <code>setInterner</code>; <code>null</code> after
     * construction.
     */
    private Interner interner;

    // CPPONLY: private boolean viewingXmlSource;

    // [NOCPP[

    protected LocatorImpl ampersandLocation;
    534 
    /**
     * Java-only constructor (it sits inside a NOCPP section) that lets the
     * caller choose whether attribute holders are created anew each time.
     * All other state is reset to zero, <code>false</code> or
     * <code>null</code>; <code>strBuf</code> is not allocated here.
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     * @param newAttributesEachTime
     *            stored in the final field of the same name
     */
    public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        this.lastCR = false;
        this.stateSave = 0;
        this.returnStateSave = 0;
        this.index = 0;
        this.forceQuirks = false;
        this.additional = '\u0000';
        this.entCol = 0;
        this.firstCharKey = 0;
        this.lo = 0;
        this.hi = 0;
        this.candidate = 0;
        this.charRefBufMark = 0;
        this.value = 0;
        this.seenDigits = false;
        this.suspendAfterCurrentNonTextToken = false;
        this.cstart = 0;
        this.strBufLen = 0;
        this.newAttributesEachTime = newAttributesEachTime;
        // &CounterClockwiseContourIntegral; is the longest valid char ref and
        // the semicolon never gets appended to the buffer.
        this.charRefBuf = new char[32];
        this.charRefBufLen = 0;
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        this.endTagExpectation = null;
        this.endTagExpectationAsArray = null;
        this.endTag = false;
        this.containsHyphen = false;
        this.tagName = null;
        // Recycled instance for tag names that are not pre-interned.
        this.nonInternedTagName = new ElementName();
        this.attributeName = null;
        // CPPONLY: this.nonInternedAttributeName = new AttributeName();
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        this.attributes = null;
        this.shouldSuspend = false;
        this.keepBuffer = false;
        this.confident = false;
        this.line = 0;
        // CPPONLY: this.attributeLine = 0;
        this.interner = null;
    }
    581 
    582    // ]NOCPP]
    583 
    /**
     * The constructor. Resets all tokenizer state to zero,
     * <code>false</code> or <code>null</code>; <code>strBuf</code> is not
     * allocated here. In Java, <code>newAttributesEachTime</code> is
     * <code>false</code> and <code>attributes</code> starts as
     * <code>null</code>; the CPPONLY lines derive both from
     * <code>tokenHandler.HasBuilder()</code> instead.
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     */
    public Tokenizer(TokenHandler tokenHandler
    // CPPONLY: , boolean viewingXmlSource
    ) {
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        // [NOCPP[
        this.newAttributesEachTime = false;
        // ]NOCPP]
        this.lastCR = false;
        this.stateSave = 0;
        this.returnStateSave = 0;
        this.index = 0;
        this.forceQuirks = false;
        this.additional = '\u0000';
        this.entCol = 0;
        this.firstCharKey = 0;
        this.lo = 0;
        this.hi = 0;
        this.candidate = 0;
        this.charRefBufMark = 0;
        this.value = 0;
        this.seenDigits = false;
        this.suspendAfterCurrentNonTextToken = false;
        this.cstart = 0;
        this.strBufLen = 0;
        // &CounterClockwiseContourIntegral; is the longest valid char ref and
        // the semicolon never gets appended to the buffer.
        this.charRefBuf = new char[32];
        this.charRefBufLen = 0;
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        this.endTagExpectation = null;
        this.endTagExpectationAsArray = null;
        this.endTag = false;
        this.containsHyphen = false;
        this.tagName = null;
        // Recycled instance for tag names that are not pre-interned.
        this.nonInternedTagName = new ElementName();
        this.attributeName = null;
        // CPPONLY: this.nonInternedAttributeName = new AttributeName();
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        // [NOCPP[
        this.attributes = null;
        // ]NOCPP]
        // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
        // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
        this.shouldSuspend = false;
        this.keepBuffer = false;
        this.confident = false;
        this.line = 0;
        // CPPONLY: this.attributeLine = 0;
        this.interner = null;
        // CPPONLY: this.viewingXmlSource = viewingXmlSource;
    }
    645 
    646    public void setInterner(Interner interner) {
    647        this.interner = interner;
    648    }
    649 
    650    public void initLocation(String newPublicId, String newSystemId) {
    651        this.systemId = newSystemId;
    652        this.publicId = newPublicId;
    653 
    654    }
    655 
    656    // CPPONLY: boolean isViewingXmlSource() {
    657    // CPPONLY: return viewingXmlSource;
    658    // CPPONLY: }
    659 
    660    public void setKeepBuffer(boolean keepBuffer) {
    661        this.keepBuffer = keepBuffer;
    662    }
    663 
    664    public boolean dropBufferIfLongerThan(int length) {
    665        if (strBuf.length > length) {
    666            strBuf = null;
    667            return true;
    668        }
    669        return false;
    670    }
    671 
    672    // [NOCPP[
    673 
    674    /**
    675     * Returns the mappingLangToXmlLang.
    676     *
    677     * @return the mappingLangToXmlLang
    678     */
    679    public boolean isMappingLangToXmlLang() {
    680        return mappingLangToXmlLang == AttributeName.HTML_LANG;
    681    }
    682 
    683    /**
    684     * Sets the mappingLangToXmlLang.
    685     *
    686     * @param mappingLangToXmlLang
    687     *            the mappingLangToXmlLang to set
    688     */
    689    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
    690        this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
    691                : AttributeName.HTML;
    692    }
    693 
    694    /**
    695     * Sets the error handler.
    696     *
    697     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
    698     */
    699    public void setErrorHandler(ErrorHandler eh) {
    700        this.errorHandler = eh;
    701    }
    702 
    703    public ErrorHandler getErrorHandler() {
    704        return this.errorHandler;
    705    }
    706 
    707    /**
    708     * Gets the errorProfile.
    709     *
    710     * @param errorProfile
    711     */
    712    public HashMap getErrorProfile() {
    713        return null;
    714    }
    715 
    716    /**
    717     * Sets the commentPolicy.
    718     *
    719     * @param commentPolicy
    720     *            the commentPolicy to set
    721     */
    722    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
    723        this.commentPolicy = commentPolicy;
    724    }
    725 
    726    /**
    727     * Sets the contentNonXmlCharPolicy.
    728     *
    729     * @param contentNonXmlCharPolicy
    730     *            the contentNonXmlCharPolicy to set
    731     */
    732    public void setContentNonXmlCharPolicy(
    733            XmlViolationPolicy contentNonXmlCharPolicy) {
    734        if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
    735            throw new IllegalArgumentException(
    736                    "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
    737        }
    738    }
    739 
    740    /**
    741     * Sets the contentSpacePolicy.
    742     *
    743     * @param contentSpacePolicy
    744     *            the contentSpacePolicy to set
    745     */
    746    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
    747        this.contentSpacePolicy = contentSpacePolicy;
    748    }
    749 
    750    /**
    751     * Sets the xmlnsPolicy.
    752     *
    753     * @param xmlnsPolicy
    754     *            the xmlnsPolicy to set
    755     */
    756    public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
    757        if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
    758            throw new IllegalArgumentException("Can't use FATAL here.");
    759        }
    760        this.xmlnsPolicy = xmlnsPolicy;
    761    }
    762 
    763    public void setNamePolicy(XmlViolationPolicy namePolicy) {
    764        this.namePolicy = namePolicy;
    765    }
    766 
    767    // ]NOCPP]
    768 
    769    // For the token handler to call
    770 
    771    /**
    772     * Sets the tokenizer state and the associated element name. This should
    773     * only ever used to put the tokenizer into one of the states that have
    774     * a special end tag expectation.
    775     *
    776     * @param specialTokenizerState
    777     *            the tokenizer state to set
    778     */
    779    public void setState(int specialTokenizerState) {
    780        this.stateSave = specialTokenizerState;
    781        this.endTagExpectation = null;
    782        this.endTagExpectationAsArray = null;
    783    }
    784 
    785    // [NOCPP[
    786 
    787    /**
    788     * Sets the tokenizer state and the associated element name. This should
    789     * only ever used to put the tokenizer into one of the states that have
    790     * a special end tag expectation. For use from the tokenizer test harness.
    791     *
    792     * @param specialTokenizerState
    793     *            the tokenizer state to set
    794     * @param endTagExpectation
    795     *            the expected end tag for transitioning back to normal
    796     */
    797    public void setStateAndEndTagExpectation(int specialTokenizerState,
    798            @Local String endTagExpectation) {
    799        this.stateSave = specialTokenizerState;
    800        if (specialTokenizerState == Tokenizer.DATA) {
    801            return;
    802        }
    803        @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
    804        this.endTagExpectation = ElementName.elementNameByBuffer(asArray,
    805                asArray.length);
    806        assert this.endTagExpectation != null;
    807        endTagExpectationToArray();
    808    }
    809 
    810    // ]NOCPP]
    811 
    812    /**
    813     * Sets the tokenizer state and the associated element name. This should
    814     * only ever used to put the tokenizer into one of the states that have
    815     * a special end tag expectation.
    816     *
    817     * @param specialTokenizerState
    818     *            the tokenizer state to set
    819     * @param endTagExpectation
    820     *            the expected end tag for transitioning back to normal
    821     */
    822    public void setStateAndEndTagExpectation(int specialTokenizerState,
    823            ElementName endTagExpectation) {
    824        this.stateSave = specialTokenizerState;
    825        this.endTagExpectation = endTagExpectation;
    826        endTagExpectationToArray();
    827    }
    828 
    829    private void endTagExpectationToArray() {
    830        switch (endTagExpectation.getGroup()) {
    831            case TreeBuilder.TITLE:
    832                endTagExpectationAsArray = TITLE_ARR;
    833                return;
    834            case TreeBuilder.SCRIPT:
    835                endTagExpectationAsArray = SCRIPT_ARR;
    836                return;
    837            case TreeBuilder.STYLE:
    838                endTagExpectationAsArray = STYLE_ARR;
    839                return;
    840            case TreeBuilder.PLAINTEXT:
    841                endTagExpectationAsArray = PLAINTEXT_ARR;
    842                return;
    843            case TreeBuilder.XMP:
    844                endTagExpectationAsArray = XMP_ARR;
    845                return;
    846            case TreeBuilder.TEXTAREA:
    847                endTagExpectationAsArray = TEXTAREA_ARR;
    848                return;
    849            case TreeBuilder.IFRAME:
    850                endTagExpectationAsArray = IFRAME_ARR;
    851                return;
    852            case TreeBuilder.NOEMBED:
    853                endTagExpectationAsArray = NOEMBED_ARR;
    854                return;
    855            case TreeBuilder.NOSCRIPT:
    856                endTagExpectationAsArray = NOSCRIPT_ARR;
    857                return;
    858            case TreeBuilder.NOFRAMES:
    859                endTagExpectationAsArray = NOFRAMES_ARR;
    860                return;
    861            default:
    862                assert false: "Bad end tag expectation.";
    863                return;
    864        }
    865    }
    866 
    /**
     * Overwrites the current line number. For C++ use only.
     *
     * @param line
     *            the value to store in the <code>line</code> field
     */
    public void setLineNumber(int line) {
        // CPPONLY: this.attributeLine = line; // XXX is this needed?
        this.line = line;
    }
    874 
    875    // start Locator impl
    876 
    /**
     * Returns the current value of the <code>line</code> field.
     *
     * @return the line number
     * @see org.xml.sax.Locator#getLineNumber()
     */
    @Inline public int getLineNumber() {
        return line;
    }
    883 
    884    // [NOCPP[
    885 
    /**
     * Always returns <code>-1</code>, the value the SAX
     * <code>Locator</code> contract uses for "no column number available".
     *
     * @return <code>-1</code>
     * @see org.xml.sax.Locator#getColumnNumber()
     */
    @Inline public int getColumnNumber() {
        return -1;
    }
    892 
    /**
     * Returns the current value of the <code>publicId</code> field.
     *
     * @return the public identifier; may be <code>null</code> if none has
     *         been stored
     * @see org.xml.sax.Locator#getPublicId()
     */
    public String getPublicId() {
        return publicId;
    }
    899 
    /**
     * Returns the current value of the <code>systemId</code> field.
     *
     * @return the system identifier; may be <code>null</code> if none has
     *         been stored
     * @see org.xml.sax.Locator#getSystemId()
     */
    public String getSystemId() {
        return systemId;
    }
    906 
    /**
     * Always reports XML version <code>"1.0"</code>.
     *
     * @return the constant string <code>"1.0"</code>
     * @see org.xml.sax.ext.Locator2#getXMLVersion()
     */
    public String getXMLVersion() {
        return "1.0";
    }
    913 
    914    /**
    915     * @see org.xml.sax.ext.Locator2#getXMLVersion()
    916     */
    917    public String getEncoding() {
    918        try {
    919            return encodingDeclarationHandler == null ? null : encodingDeclarationHandler.getCharacterEncoding();
    920        } catch (SAXException e) {
    921            return null;
    922        }
    923    }
    924 
    925    // end Locator impl
    926 
    927    // end public API
    928 
    /**
     * Sets the <code>metaBoundaryPassed</code> flag.
     *
     * Once the flag is set, <code>addAttributeWithValue()</code> and
     * <code>addAttributeWithoutValue()</code> report an error for a
     * <code>charset</code> attribute on a <code>meta</code> element
     * ("found after the first 1024 bytes").
     */
    public void notifyAboutMetaBoundary() {
        metaBoundaryPassed = true;
    }
    932 
    933    // ]NOCPP]
    934 
    /**
     * Returns an attribute holder that contains no attributes.
     *
     * In the Java code a fresh <code>HtmlAttributes</code> is allocated when
     * <code>newAttributesEachTime</code> is set; otherwise the shared
     * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code> instance is returned.
     * NOTE(review): the NOCPP markers presumably remove the branch from the
     * C++ translation, leaving only the shared instance there -- confirm
     * against the translator.
     *
     * @return an empty attribute holder
     */
    @Inline HtmlAttributes emptyAttributes() {
        // [NOCPP[
        if (newAttributesEachTime) {
            return new HtmlAttributes(mappingLangToXmlLang);
        } else {
            // ]NOCPP]
            return HtmlAttributes.EMPTY_ATTRIBUTES;
            // [NOCPP[
        }
        // ]NOCPP]
    }
    946 
    /**
     * Appends one code unit to the character reference buffer and advances
     * <code>charRefBufLen</code>.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    private void appendCharRefBuf(char c) {
        // CPPONLY: assert charRefBufLen < charRefBuf.length:
        // CPPONLY:     "RELEASE: Attempted to overrun charRefBuf!";
        charRefBuf[charRefBufLen++] = c;
    }
    952 
    953    private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
    954        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
    955            appendCharRefBufToStrBuf();
    956        } else {
    957            if (charRefBufLen > 0) {
    958                tokenHandler.characters(charRefBuf, 0, charRefBufLen);
    959                charRefBufLen = 0;
    960            }
    961        }
    962    }
    963 
    /**
     * Marks <code>strBuf</code> as empty by resetting <code>strBufLen</code>
     * to zero. The array contents are left as they are.
     */
    @Inline private void clearStrBufAfterUse() {
        strBufLen = 0;
    }
    967 
    /**
     * Prepares <code>strBuf</code> for a new use. The buffer is expected to
     * be empty already (checked by the assertion); the length is reset
     * anyway so that it is zero even when assertions are disabled.
     */
    @Inline private void clearStrBufBeforeUse() {
        assert strBufLen == 0: "strBufLen not reset after previous use!";
        strBufLen = 0; // no-op in the absence of bugs
    }
    972 
    /**
     * Empties <code>strBuf</code> at a point where it is expected to hold
     * exactly one code unit, a hyphen. Both expectations are checked by
     * assertions only; the length is reset unconditionally.
     */
    @Inline private void clearStrBufAfterOneHyphen() {
        assert strBufLen == 1: "strBufLen length not one!";
        assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
        strBufLen = 0;
    }
    978 
    /**
     * Appends to the buffer.
     *
     * <p>
     * The Java statement below performs no capacity check of its own; it
     * relies on <code>strBuf</code> already being large enough
     * (<code>tokenizeBuffer</code> calls <code>ensureBufferSpace</code>
     * before processing input).
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendStrBuf(char c) {
        // CPPONLY: if (strBufLen == strBuf.length) {
        // CPPONLY:     EnsureBufferSpaceShouldNeverHappen(1);
        // CPPONLY: }
        strBuf[strBufLen++] = c;
    }
    991 
    /**
     * Converts the buffer to a String and then empties the buffer.
     *
     * <p>
     * Among other uses, <code>addAttributeWithValue()</code> calls this to
     * obtain the attribute value.
     *
     * <p>
     * C++ memory note: The return value must be released.
     *
     * @return the buffer as a string
     */
    @Inline protected String strBufToString() {
        // CPPONLY: String digitAtom = TryAtomizeForSingleDigit();
        // CPPONLY: if (digitAtom) {
        // CPPONLY:   return digitAtom;
        // CPPONLY: }
        // CPPONLY:
        // CPPONLY: boolean maybeAtomize = false;
        // CPPONLY: if (!newAttributesEachTime) {
        // CPPONLY:   if (attributeName == AttributeName.CLASS ||
        // CPPONLY:       attributeName == AttributeName.TYPE) {
        // CPPONLY:     maybeAtomize = true;
        // CPPONLY:   }
        // CPPONLY: }
        // CPPONLY:
        String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
            // CPPONLY: , tokenHandler, maybeAtomize
        );
        // The string has been built from the buffer; reset it for reuse.
        clearStrBufAfterUse();
        return str;
    }
   1020 
    /**
     * Converts the buffer to a local name, stores it in the
     * <code>doctypeName</code> field and then empties the buffer. The
     * stored value is released in emitDoctypeToken().
     */
    @Inline private void strBufToDoctypeName() {
        doctypeName = Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner);
        clearStrBufAfterUse();
    }
   1031 
    /**
     * Emits the buffer as character tokens and then empties it. Does
     * nothing when the buffer is already empty.
     *
     * @throws SAXException
     *             if the token handler threw
     */
    @Inline private void emitStrBuf() throws SAXException {
        if (strBufLen > 0) {
            tokenHandler.characters(strBuf, 0, strBufLen);
            clearStrBufAfterUse();
        }
    }
   1044 
    /**
     * Appends a hyphen to the comment buffer at a point where the result is
     * two consecutive hyphens, applying <code>commentPolicy</code> in the
     * Java code:
     * <ul>
     * <li><code>ALTER_INFOSET</code>: appends a space first, then behaves
     * like <code>ALLOW</code> (intentional fall-through).</li>
     * <li><code>ALLOW</code>: issues a warning and appends the hyphen.</li>
     * <li><code>FATAL</code>: calls <code>fatal()</code>, which throws; the
     * hyphen is not appended.</li>
     * </ul>
     * The only statement outside the NOCPP regions is the
     * <code>appendStrBuf('-')</code> call.
     *
     * @throws SAXException
     *             if the error handler threw or the policy is
     *             <code>FATAL</code>
     */
    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf('-');
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
   1063 
   1064    // [NOCPP[
   1065    private void maybeAppendSpaceToBogusComment() throws SAXException {
   1066        switch (commentPolicy) {
   1067            case ALTER_INFOSET:
   1068                appendStrBuf(' ');
   1069                // CPPONLY: MOZ_FALLTHROUGH;
   1070            case ALLOW:
   1071                warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
   1072                break;
   1073            case FATAL:
   1074                fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
   1075                break;
   1076        }
   1077    }
   1078 
   1079    // ]NOCPP]
   1080 
    /**
     * Appends <code>c</code> to the comment buffer after two consecutive
     * hyphens, applying <code>commentPolicy</code> in the Java code:
     * <ul>
     * <li><code>ALTER_INFOSET</code>: drops the last buffered code unit and
     * appends a space followed by a hyphen in its place, then behaves like
     * <code>ALLOW</code> (intentional fall-through).</li>
     * <li><code>ALLOW</code>: warns unless
     * <code>reportedConsecutiveHyphens</code> is set, then appends
     * <code>c</code>.</li>
     * <li><code>FATAL</code>: calls <code>fatal()</code>, which throws;
     * <code>c</code> is not appended.</li>
     * </ul>
     * The only statement outside the NOCPP regions is the
     * <code>appendStrBuf(c)</code> call.
     *
     * @param c
     *            the UTF-16 code unit to append
     * @param reportedConsecutiveHyphens
     *            <code>true</code> to suppress the warning
     * @throws SAXException
     *             if the error handler threw or the policy is
     *             <code>FATAL</code>
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c, boolean reportedConsecutiveHyphens)
            throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                strBufLen--;
                // WARNING!!! This expands the worst case of the buffer length
                // given the length of input!
                appendStrBuf(' ');
                appendStrBuf('-');
                // CPPONLY: MOZ_FALLTHROUGH;
            case ALLOW:
                if (!reportedConsecutiveHyphens) {
                    warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                }
                // ]NOCPP]
                appendStrBuf(c);
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }
   1106 
    /**
     * Appends a range of UTF-16 code units to the buffer.
     *
     * @param buffer
     *            the array to copy from
     * @param offset
     *            the index of the first code unit in <code>buffer</code> to
     *            copy
     * @param length
     *            the number of code units to copy
     * @throws SAXException
     *             declared in the signature; the Java statements in this
     *             body do not raise it
     */
    @Inline private void appendStrBuf(@NoLength char[] buffer, int offset, int length) throws SAXException {
        // Years of crash stats have shown that this addition doesn't overflow, as it logically
        // shouldn't.
        int newLen = strBufLen + length;
        // CPPONLY: if (strBuf.length < newLen) {
        // CPPONLY:     EnsureBufferSpaceShouldNeverHappen(length);
        // CPPONLY: }
        System.arraycopy(buffer, offset, strBuf, strBufLen, length);
        strBufLen = newLen;
    }
   1117 
    /**
     * Append the contents of the char reference buffer to the main one and
     * then empty the char reference buffer.
     *
     * @throws SAXException
     *             propagated from <code>appendStrBuf</code>
     */
    @Inline private void appendCharRefBufToStrBuf() throws SAXException {
        appendStrBuf(charRefBuf, 0, charRefBufLen);
        charRefBufLen = 0;
    }
   1125 
    /**
     * Emits the current comment token and empties the buffer.
     *
     * NOTE: The method may set <code>shouldSuspend</code>, so the caller
     * must have this pattern after the state's <code>transition</code> call:
     *
     * <pre>
     * if (shouldSuspend) {
     *     break stateloop;
     * }
     * continue stateloop;
     * </pre>
     *
     * @param provisionalHyphens
     *            the number of code units at the end of the buffer that are
     *            left out of the emitted comment
     * @param pos
     *            the current index into the input buffer; the next range of
     *            character tokens is marked as starting at
     *            <code>pos + 1</code>
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // CPPONLY: RememberGt(pos);
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            tokenHandler.comment(strBuf, 0, strBufLen
                    - provisionalHyphens);
            // [NOCPP[
        }
        // ]NOCPP]
        clearStrBufAfterUse();
        cstart = pos + 1;
        suspendIfRequestedAfterCurrentNonTextToken();
    }
   1159 
   1160    /**
   1161     * Flushes coalesced character tokens.
   1162     *
   1163     * @param buf
   1164     *            TODO
   1165     * @param pos
   1166     *            TODO
   1167     *
   1168     * @throws SAXException
   1169     */
   1170    protected void flushChars(@NoLength char[] buf, int pos)
   1171            throws SAXException {
   1172        if (pos > cstart) {
   1173            tokenHandler.characters(buf, cstart, pos - cstart);
   1174        }
   1175        cstart = Integer.MAX_VALUE;
   1176    }
   1177 
   1178    /**
   1179     * Reports an condition that would make the infoset incompatible with XML
   1180     * 1.0 as fatal.
   1181     *
   1182     * @param message
   1183     *            the message
   1184     * @throws SAXException
   1185     * @throws SAXParseException
   1186     */
   1187    public void fatal(String message) throws SAXException {
   1188        SAXParseException spe = new SAXParseException(message, this);
   1189        if (errorHandler != null) {
   1190            errorHandler.fatalError(spe);
   1191        }
   1192        throw spe;
   1193    }
   1194 
   1195    /**
   1196     * Reports a Parse Error.
   1197     *
   1198     * @param message
   1199     *            the message
   1200     * @throws SAXException
   1201     */
   1202    public void err(String message) throws SAXException {
   1203        if (errorHandler == null) {
   1204            return;
   1205        }
   1206        SAXParseException spe = new SAXParseException(message, this);
   1207        errorHandler.error(spe);
   1208    }
   1209 
   1210    public void errTreeBuilder(String message) throws SAXException {
   1211        ErrorHandler eh = null;
   1212        if (tokenHandler instanceof TreeBuilder<?>) {
   1213            TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
   1214            eh = treeBuilder.getErrorHandler();
   1215        }
   1216        if (eh == null) {
   1217            eh = errorHandler;
   1218        }
   1219        if (eh == null) {
   1220            return;
   1221        }
   1222        SAXParseException spe = new SAXParseException(message, this);
   1223        eh.error(spe);
   1224    }
   1225 
   1226    /**
   1227     * Reports a warning
   1228     *
   1229     * @param message
   1230     *            the message
   1231     * @throws SAXException
   1232     */
   1233    public void warn(String message) throws SAXException {
   1234        if (errorHandler == null) {
   1235            return;
   1236        }
   1237        SAXParseException spe = new SAXParseException(message, this);
   1238        errorHandler.warning(spe);
   1239    }
   1240 
    /**
     * Resolves the tag name held in the buffer into <code>tagName</code>,
     * then resets <code>containsHyphen</code> and empties the buffer.
     *
     * Names containing a hyphen are compared against
     * <code>annotation-xml</code> only; other names are looked up with
     * <code>ElementName.elementNameByBuffer</code>. A name that is not
     * resolved either way is stored in the reusable
     * <code>nonInternedTagName</code> holder, which then becomes
     * <code>tagName</code>.
     */
    private void strBufToElementNameString() {
        if (containsHyphen) {
            // We've got a custom element or annotation-xml.
            @Local String annotationName = ElementName.ANNOTATION_XML.getName();
            if (Portability.localEqualsBuffer(annotationName, strBuf, strBufLen)) {
                tagName = ElementName.ANNOTATION_XML;
            } else {
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                        interner)
                        // CPPONLY: , true
                        );
                tagName = nonInternedTagName;
            }
        } else {
            tagName = ElementName.elementNameByBuffer(strBuf, strBufLen);
            if (tagName == null) {
                // No predefined element name matched the buffer.
                nonInternedTagName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                    interner)
                        // CPPONLY: , false
                        );
                tagName = nonInternedTagName;
            }
        }
        containsHyphen = false;
        clearStrBufAfterUse();
    }
   1267 
    /**
     * Emits a tag token.
     *
     * NOTE: The method may set <code>shouldSuspend</code>, so the caller
     * must have this pattern after the state's <code>transition</code> call:
     * <pre>
     * if (shouldSuspend) {
     *     break stateloop;
     * }
     * continue stateloop;
     * </pre>
     *
     * @param selfClosing
     *            whether the tag had the self-closing flag; passed to
     *            <code>maybeErrSlashInEndTag</code> and, for start tags, to
     *            the token handler
     * @param pos
     *            the current index into the input buffer; the next range of
     *            character tokens is marked as starting at
     *            <code>pos + 1</code>
     * @return the value of <code>stateSave</code> after the token handler
     *         has run: <code>Tokenizer.DATA</code> unless the handler
     *         changed it
     * @throws SAXException
     *             if the token handler or an error reporting method threw
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        // CPPONLY: RememberGt(pos);
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        stateSave = Tokenizer.DATA;
        // The handler always receives a non-null attribute holder.
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY:   assert newAttributesEachTime;
            // CPPONLY:   Portability.delete(attributes);
            // CPPONLY:   attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName = null;
        if (newAttributesEachTime) {
            // Drop the reference so a later attribute allocates a new holder
            // (see attributeNameComplete()).
            attributes = null;
        } else {
            // The same holder is reused for the next tag.
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        suspendIfRequestedAfterCurrentNonTextToken();
        return stateSave;
    }
   1328 
    /**
     * Turns the buffer contents into <code>attributeName</code> and empties
     * the buffer.
     *
     * The name is looked up with <code>AttributeName.nameByBuffer</code>; if
     * that yields nothing, the Java code creates a new
     * <code>AttributeName</code> for it. The attribute holder is allocated
     * here if it does not exist yet. If the holder already contains the
     * name, the duplicate is reported and <code>attributeName</code> is set
     * to <code>null</code>, which makes <code>addAttributeWithValue()</code>
     * and <code>addAttributeWithoutValue()</code> discard the attribute.
     *
     * @throws SAXException
     *             if error reporting threw
     */
    private void attributeNameComplete() throws SAXException {
        attributeName = AttributeName.nameByBuffer(strBuf, strBufLen, interner);
        if (attributeName == null) {
            // [NOCPP[
            attributeName = AttributeName.createAttributeName(
                    Portability.newLocalNameFromBuffer(strBuf, strBufLen,
                            interner),
                    namePolicy != XmlViolationPolicy.ALLOW);
            // ]NOCPP]
            // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(Portability.newLocalNameFromBuffer(strBuf, strBufLen, interner));
            // CPPONLY:     attributeName = nonInternedAttributeName;
        }
        clearStrBufAfterUse();

        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName = null;
        }
    }
   1360 
    /**
     * Adds the current attribute to the attribute holder with an empty
     * string as its value.
     *
     * When <code>attributeName</code> is <code>null</code> (set by
     * <code>attributeNameComplete()</code> for a duplicate), nothing is
     * added and the buffer is emptied instead. The Java code additionally
     * reports a <code>charset</code> attribute on <code>meta</code> seen
     * after the meta boundary and warns about valueless <code>src</code> and
     * <code>href</code> attributes.
     *
     * @throws SAXException
     *             if error reporting threw
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (AttributeName.SRC == attributeName
                    || AttributeName.HREF == attributeName) {
                warn("Attribute \u201C"
                        + attributeName.getLocal(AttributeName.HTML)
                        + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName,
                    Portability.newEmptyString()
                    // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            // The name has been consumed by the attribute holder.
            attributeName = null;
        } else {
            clearStrBufAfterUse();
        }
    }
   1391 
    /**
     * Adds the current attribute to the attribute holder, using the buffer
     * contents as its value. The buffer is emptied in either branch.
     *
     * When <code>attributeName</code> is <code>null</code> (set by
     * <code>attributeNameComplete()</code> for a duplicate), the buffered
     * value is discarded. The Java code additionally reports a
     * <code>charset</code> attribute on <code>meta</code> seen after the
     * meta boundary.
     *
     * @throws SAXException
     *             if error reporting threw
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = strBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            // The name has been consumed by the attribute holder.
            attributeName = null;
        } else {
            // We have a duplicate attribute. Explicitly discard its value.
            clearStrBufAfterUse();
        }
    }
   1417 
   1418    // [NOCPP[
   1419 
    /**
     * Hook that <code>start()</code> calls as its last step in the Java
     * code. This implementation is empty; being <code>protected</code>, it
     * is presumably meant to be overridden by subclasses that report errors.
     *
     * @throws SAXException
     *             not thrown by this empty implementation
     */
    protected void startErrorReporting() throws SAXException {

    }
   1423 
   1424    // ]NOCPP]
   1425 
    /**
     * Starts tokenization: initializes the tokenizer with
     * <code>initializeWithoutStarting()</code>, notifies the token handler
     * through <code>startTokenization(this)</code> and, in the Java code,
     * calls the <code>startErrorReporting()</code> hook.
     *
     * @throws SAXException
     *             if the token handler or the hook threw
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // CPPONLY: if (mViewSource) {
        // CPPONLY:   line = 1;
        // CPPONLY:   col = -1;
        // CPPONLY:   nextCharOnNewLine = false;
        // CPPONLY: } else if (tokenHandler.WantsLineAndColumn()) {
        // CPPONLY:   line = 0;
        // CPPONLY:   col = 1;
        // CPPONLY:   nextCharOnNewLine = true;
        // CPPONLY: } else {
        // CPPONLY:   line = -1;
        // CPPONLY:   col = -1;
        // CPPONLY:   nextCharOnNewLine = false;
        // CPPONLY: }
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }
   1446 
    /**
     * Tokenizes the part of <code>buffer</code> between its start and end
     * indices, resuming from the state saved in <code>stateSave</code> and
     * <code>returnStateSave</code>.
     *
     * On return the start index of <code>buffer</code> has been advanced
     * past the input that was processed: to <code>end</code> when the state
     * loop ran to the end of the buffer, otherwise to one past the last
     * position the loop reached.
     *
     * @param buffer
     *            the input to tokenize; its start index is updated
     * @return the value of <code>lastCR</code>, which is reset to
     *         <code>false</code> on entry; per the CR handling notes in
     *         <code>stateLoop</code>, it is set when a CR was handled, so
     *         that the IO driver can swallow a directly following LF
     * @throws SAXException
     *             if the token handler or error reporting threw
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        int end = buffer.getEnd();

        // In C++, the caller of tokenizeBuffer needs to do this explicitly.
        // [NOCPP[
        ensureBufferSpace(end - start);
        // ]NOCPP]

        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         */
        int pos = start - 1;

        // In the states listed below a range of pending character tokens
        // starts at the beginning of this buffer; in every other state
        // Integer.MAX_VALUE marks that no range is pending (flushChars()
        // emits only when pos > cstart).
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        // CPPONLY: if (mViewSource) {
        // CPPONLY:   mViewSource.SetBuffer(buffer);
        // CPPONLY:   if (htmlaccelEnabled()) {
        // CPPONLY:     pos = StateLoopViewSourceSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   } else {
        // CPPONLY:     pos = StateLoopViewSourceALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   }
        // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else if (tokenHandler.WantsLineAndColumn()) {
        // CPPONLY:   if (htmlaccelEnabled()) {
        // CPPONLY:     pos = StateLoopLineColSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   } else {
        // CPPONLY:     pos = StateLoopLineColALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   }
        // CPPONLY: } else if (htmlaccelEnabled()) {
        // CPPONLY:   pos = StateLoopFastestSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: } else {
        // CPPONLY:   pos = StateLoopFastestALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                end);
        // ]NOCPP]
        if (pos == end) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            // The loop stopped at pos before the end of the buffer; resume
            // with the character after it.
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }
   1523 
   1524    // [NOCPP[
   1525    private void ensureBufferSpace(int inputLength) throws SAXException {
   1526        // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
   1527        // Adding to the general worst case instead of only the
   1528        // TreeBuilder-exposed worst case to avoid re-introducing a bug when
   1529        // unifying the tokenizer and tree builder buffers in the future.
   1530        int worstCase = strBufLen + inputLength + charRefBufLen + 2;
   1531        tokenHandler.ensureBufferSpace(worstCase);
   1532        if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
   1533            // When altering infoset, if the comment contents are consecutive
   1534            // hyphens, each hyphen generates a space, too. These buffer
   1535            // contents never get emitted as characters() to the tokenHandler,
   1536            // which is why this calculation happens after the call to
   1537            // ensureBufferSpace on tokenHandler.
   1538            worstCase *= 2;
   1539        }
   1540        if (strBuf == null) {
   1541            // Add an arbitrary small value to avoid immediate reallocation
   1542            // once there are a few characters in the buffer.
   1543            strBuf = new char[worstCase + 128];
   1544        } else if (worstCase > strBuf.length) {
   1545            // HotSpot reportedly allocates memory with 8-byte accuracy, so
   1546            // there's no point in trying to do math here to avoid slop.
   1547            // Maybe we should add some small constant to worstCase here
   1548            // but not doing that without profiling. In C++ with jemalloc,
   1549            // the corresponding method should do math to round up here
   1550            // to avoid slop.
   1551            char[] newBuf = new char[Math.max(worstCase, (strBuf.length*5)/4)];
   1552            System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
   1553            strBuf = newBuf;
   1554        }
   1555    }
   1556    // ]NOCPP]
   1557 
   1558    @SuppressWarnings("unused") @Inline private int stateLoop(int state, char c,
   1559            int pos, @NoLength char[] buf, boolean reconsume, int returnState,
   1560            int endPos) throws SAXException {
   1561        boolean reportedConsecutiveHyphens = false;
   1562        /*
   1563         * Idioms used in this code:
   1564         *
   1565         *
   1566         * Consuming the next input character
   1567         *
   1568         * To consume the next input character, the code does this: if (++pos ==
   1569         * endPos) { break stateloop; } c = checkChar(buf, pos);
   1570         *
   1571         *
   1572         * Staying in a state
   1573         *
   1574         * When there's a state that the tokenizer may stay in over multiple
   1575         * input characters, the state has a wrapper |for(;;)| loop and staying
   1576         * in the state continues the loop.
   1577         *
   1578         *
   1579         * Switching to another state
   1580         *
   1581         * To switch to another state, the code sets the state variable to the
   1582         * magic number of the new state. Then it either continues stateloop or
   1583         * breaks out of the state's own wrapper loop if the target state is
   1584         * right after the current state in source order. (This is a partial
   1585         * workaround for Java's lack of goto.)
   1586         *
   1587         *
   1588         * Reconsume support
   1589         *
   1590         * The spec sometimes says that an input character is reconsumed in
   1591         * another state. If a state can ever be entered so that an input
   1592         * character can be reconsumed in it, the state's code starts with an
   1593         * |if (reconsume)| that sets reconsume to false and skips over the
   1594         * normal code for consuming a new character.
   1595         *
   1596         * To reconsume the current character in another state, the code sets
   1597         * |reconsume| to true and then switches to the other state.
   1598         *
   1599         *
   1600         * Emitting character tokens
   1601         *
   1602         * This method emits character tokens lazily. Whenever a new range of
   1603         * character tokens starts, the field cstart must be set to the start
   1604         * index of the range. The flushChars() method must be called at the end
   1605         * of a range to flush it.
   1606         *
   1607         *
   1608         * U+0000 handling
   1609         *
   1610         * The various states have to handle the replacement of U+0000 with
   1611         * U+FFFD. However, if U+0000 would be reconsumed in another state, the
   1612         * replacement doesn't need to happen, because it's handled by the
   1613         * reconsuming state.
   1614         *
   1615         *
   1616         * LF handling
   1617         *
   1618         * Every state needs to increment the line number upon LF unless the LF
   1619         * gets reconsumed by another state which increments the line number.
   1620         *
   1621         *
   1622         * CR handling
   1623         *
   1624         * Every state needs to handle CR unless the CR gets reconsumed and is
   1625         * handled by the reconsuming state. The CR needs to be handled as if it
    1626         * were an LF, the lastCR field must be set to true and then this
   1627         * method must return. The IO driver will then swallow the next
   1628         * character if it is an LF to coalesce CRLF.
   1629         */
   1630        stateloop: for (;;) {
   1631            switch (state) {
   1632                case DATA:
   1633                    dataloop: for (;;) {
   1634                        if (reconsume) {
   1635                            reconsume = false;
   1636                        } else {
   1637                            ++pos;
   1638                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   1639                            // The line below advances pos by some number of code units that this state is indifferent to.
   1640                            // CPPONLY: pos += accelerateAdvancementData(buf, pos, endPos);
   1641                            if (pos == endPos) {
   1642                                break stateloop;
   1643                            }
   1644                            c = checkChar(buf, pos);
   1645                        }
   1646                        switch (c) {
   1647                            case '&':
   1648                                /*
   1649                                 * U+0026 AMPERSAND (&) Switch to the character
   1650                                 * reference in data state.
   1651                                 */
   1652                                flushChars(buf, pos);
   1653                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
   1654                                appendCharRefBuf(c);
   1655                                setAdditionalAndRememberAmpersandLocation('\u0000');
   1656                                returnState = state;
   1657                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
   1658                                continue stateloop;
   1659                            case '<':
   1660                                /*
   1661                                 * U+003C LESS-THAN SIGN (<) Switch to the tag
   1662                                 * open state.
   1663                                 */
   1664                                flushChars(buf, pos);
   1665 
   1666                                state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
   1667                                // `break` optimizes; `continue stateloop;` would be valid
   1668                                break dataloop;
   1669                            case '\u0000':
   1670                                maybeEmitReplacementCharacter(buf, pos);
   1671                                continue;
   1672                            case '\r':
   1673                                emitCarriageReturn(buf, pos);
   1674                                break stateloop;
   1675                            case '\n':
   1676                                silentLineFeed();
   1677                                // CPPONLY: MOZ_FALLTHROUGH;
   1678                            default:
   1679                                /*
   1680                                 * Anything else Emit the input character as a
   1681                                 * character token.
   1682                                 *
   1683                                 * Stay in the data state.
   1684                                 */
   1685                                continue;
   1686                        }
   1687                    }
   1688                    // CPPONLY: MOZ_FALLTHROUGH;
   1689                case TAG_OPEN:
   1690                    tagopenloop: for (;;) {
   1691                        /*
   1692                         * The behavior of this state depends on the content
   1693                         * model flag.
   1694                         */
   1695                        if (++pos == endPos) {
   1696                            break stateloop;
   1697                        }
   1698                        c = checkChar(buf, pos);
   1699                        /*
   1700                         * If the content model flag is set to the PCDATA state
   1701                         * Consume the next input character:
   1702                         */
   1703                        if (c >= 'A' && c <= 'Z') {
   1704                            /*
   1705                             * U+0041 LATIN CAPITAL LETTER A through to U+005A
   1706                             * LATIN CAPITAL LETTER Z Create a new start tag
   1707                             * token,
   1708                             */
   1709                            endTag = false;
   1710                            /*
   1711                             * set its tag name to the lowercase version of the
   1712                             * input character (add 0x0020 to the character's
   1713                             * code point),
   1714                             */
   1715                            clearStrBufBeforeUse();
   1716                            appendStrBuf((char) (c + 0x20));
   1717                            containsHyphen = false;
   1718                            /* then switch to the tag name state. */
   1719                            state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
   1720                            /*
   1721                             * (Don't emit the token yet; further details will
   1722                             * be filled in before it is emitted.)
   1723                             */
   1724                            // `break` optimizes; `continue stateloop;` would be valid
   1725                            break tagopenloop;
   1726                        } else if (c >= 'a' && c <= 'z') {
   1727                            /*
   1728                             * U+0061 LATIN SMALL LETTER A through to U+007A
   1729                             * LATIN SMALL LETTER Z Create a new start tag
   1730                             * token,
   1731                             */
   1732                            endTag = false;
   1733                            /*
   1734                             * set its tag name to the input character,
   1735                             */
   1736                            clearStrBufBeforeUse();
   1737                            appendStrBuf(c);
   1738                            containsHyphen = false;
   1739                            /* then switch to the tag name state. */
   1740                            state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
   1741                            /*
   1742                             * (Don't emit the token yet; further details will
   1743                             * be filled in before it is emitted.)
   1744                             */
   1745                            // `break` optimizes; `continue stateloop;` would be valid
   1746                            break tagopenloop;
   1747                        }
   1748                        switch (c) {
   1749                            case '!':
   1750                                /*
   1751                                 * U+0021 EXCLAMATION MARK (!) Switch to the
   1752                                 * markup declaration open state.
   1753                                 */
   1754                                state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
   1755                                continue stateloop;
   1756                            case '/':
   1757                                /*
   1758                                 * U+002F SOLIDUS (/) Switch to the close tag
   1759                                 * open state.
   1760                                 */
   1761                                state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
   1762                                continue stateloop;
   1763                            case '?':
   1764                                // CPPONLY: if (viewingXmlSource) {
   1765                                // CPPONLY: state = transition(state,
   1766                                // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
   1767                                // CPPONLY: reconsume,
   1768                                // CPPONLY: pos);
   1769                                // CPPONLY: continue stateloop;
   1770                                // CPPONLY: }
   1771                                /*
   1772                                 * U+003F QUESTION MARK (?) Parse error.
   1773                                 */
   1774                                errProcessingInstruction();
   1775                                /*
   1776                                 * Switch to the bogus comment state.
   1777                                 */
   1778                                clearStrBufBeforeUse();
   1779                                appendStrBuf(c);
   1780                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   1781                                continue stateloop;
   1782                            case '>':
   1783                                /*
   1784                                 * U+003E GREATER-THAN SIGN (>) Parse error.
   1785                                 */
   1786                                errLtGt();
   1787                                /*
   1788                                 * Emit a U+003C LESS-THAN SIGN character token
   1789                                 * and a U+003E GREATER-THAN SIGN character
   1790                                 * token.
   1791                                 */
   1792                                tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
   1793                                /* Switch to the data state. */
   1794                                cstart = pos + 1;
   1795                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   1796                                continue stateloop;
   1797                            default:
   1798                                /*
   1799                                 * Anything else Parse error.
   1800                                 */
   1801                                errBadCharAfterLt(c);
   1802                                /*
   1803                                 * Emit a U+003C LESS-THAN SIGN character token
   1804                                 */
   1805                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   1806                                /*
   1807                                 * and reconsume the current input character in
   1808                                 * the data state.
   1809                                 */
   1810                                cstart = pos;
   1811                                reconsume = true;
   1812                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   1813                                continue stateloop;
   1814                        }
   1815                    }
   1816                    // CPPONLY: MOZ_FALLTHROUGH;
   1817                case TAG_NAME:
   1818                    tagnameloop: for (;;) {
   1819                        if (++pos == endPos) {
   1820                            break stateloop;
   1821                        }
   1822                        c = checkChar(buf, pos);
   1823                        /*
   1824                         * Consume the next input character:
   1825                         */
   1826                        switch (c) {
   1827                            case '\r':
   1828                                silentCarriageReturn();
   1829                                strBufToElementNameString();
   1830                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   1831                                break stateloop;
   1832                            case '\n':
   1833                                silentLineFeed();
   1834                                // CPPONLY: MOZ_FALLTHROUGH;
   1835                            case ' ':
   1836                            case '\t':
   1837                            case '\u000C':
   1838                                /*
   1839                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   1840                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   1841                                 * Switch to the before attribute name state.
   1842                                 */
   1843                                strBufToElementNameString();
   1844                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   1845                                // `break` optimizes; `continue stateloop;` would be valid
   1846                                break tagnameloop;
   1847                            case '/':
   1848                                /*
   1849                                 * U+002F SOLIDUS (/) Switch to the self-closing
   1850                                 * start tag state.
   1851                                 */
   1852                                strBufToElementNameString();
   1853                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
   1854                                continue stateloop;
   1855                            case '>':
   1856                                /*
   1857                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   1858                                 * tag token.
   1859                                 */
   1860                                strBufToElementNameString();
   1861                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   1862                                if (shouldSuspend) {
   1863                                    break stateloop;
   1864                                }
   1865                                /*
   1866                                 * Switch to the data state.
   1867                                 */
   1868                                continue stateloop;
   1869                            case '\u0000':
   1870                                c = '\uFFFD';
   1871                                // CPPONLY: MOZ_FALLTHROUGH;
   1872                            default:
   1873                                if (c >= 'A' && c <= 'Z') {
   1874                                    /*
   1875                                     * U+0041 LATIN CAPITAL LETTER A through to
   1876                                     * U+005A LATIN CAPITAL LETTER Z Append the
   1877                                     * lowercase version of the current input
   1878                                     * character (add 0x0020 to the character's
   1879                                     * code point) to the current tag token's
   1880                                     * tag name.
   1881                                     */
   1882                                    c += 0x20;
   1883                                } else if (c == '-') {
   1884                                    containsHyphen = true;
   1885                                }
   1886                                /*
   1887                                 * Anything else Append the current input
   1888                                 * character to the current tag token's tag
   1889                                 * name.
   1890                                 */
   1891                                appendStrBuf(c);
   1892                                /*
   1893                                 * Stay in the tag name state.
   1894                                 */
   1895                                continue;
   1896                        }
   1897                    }
   1898                    // CPPONLY: MOZ_FALLTHROUGH;
   1899                case BEFORE_ATTRIBUTE_NAME:
   1900                    beforeattributenameloop: for (;;) {
   1901                        if (reconsume) {
   1902                            reconsume = false;
   1903                        } else {
   1904                            if (++pos == endPos) {
   1905                                break stateloop;
   1906                            }
   1907                            c = checkChar(buf, pos);
   1908                        }
   1909                        /*
   1910                         * Consume the next input character:
   1911                         */
   1912                        switch (c) {
   1913                            case '\r':
   1914                                silentCarriageReturn();
   1915                                break stateloop;
   1916                            case '\n':
   1917                                silentLineFeed();
   1918                                // CPPONLY: MOZ_FALLTHROUGH;
   1919                            case ' ':
   1920                            case '\t':
   1921                            case '\u000C':
   1922                                /*
   1923                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   1924                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   1925                                 * in the before attribute name state.
   1926                                 */
   1927                                continue;
   1928                            case '/':
   1929                                /*
   1930                                 * U+002F SOLIDUS (/) Switch to the self-closing
   1931                                 * start tag state.
   1932                                 */
   1933                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
   1934                                continue stateloop;
   1935                            case '>':
   1936                                /*
   1937                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   1938                                 * tag token.
   1939                                 */
   1940                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   1941                                if (shouldSuspend) {
   1942                                    break stateloop;
   1943                                }
   1944                                /*
   1945                                 * Switch to the data state.
   1946                                 */
   1947                                continue stateloop;
   1948                            case '\u0000':
   1949                                c = '\uFFFD';
   1950                                // CPPONLY: MOZ_FALLTHROUGH;
   1951                            case '\"':
   1952                            case '\'':
   1953                            case '<':
   1954                            case '=':
   1955                                /*
   1956                                 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
   1957                                 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
   1958                                 * SIGN (=) Parse error.
   1959                                 */
   1960                                errBadCharBeforeAttributeNameOrNull(c);
   1961                                /*
   1962                                 * Treat it as per the "anything else" entry
   1963                                 * below.
   1964                                 */
   1965                                // CPPONLY: MOZ_FALLTHROUGH;
   1966                            default:
   1967                                /*
   1968                                 * Anything else Start a new attribute in the
   1969                                 * current tag token.
   1970                                 */
   1971                                if (c >= 'A' && c <= 'Z') {
   1972                                    /*
   1973                                     * U+0041 LATIN CAPITAL LETTER A through to
   1974                                     * U+005A LATIN CAPITAL LETTER Z Set that
   1975                                     * attribute's name to the lowercase version
   1976                                     * of the current input character (add
   1977                                     * 0x0020 to the character's code point)
   1978                                     */
   1979                                    c += 0x20;
   1980                                }
   1981                                // CPPONLY: attributeLine = line;
   1982                                /*
   1983                                 * Set that attribute's name to the current
   1984                                 * input character,
   1985                                 */
   1986                                clearStrBufBeforeUse();
   1987                                appendStrBuf(c);
   1988                                /*
   1989                                 * and its value to the empty string.
   1990                                 */
   1991                                // Will do later.
   1992                                /*
   1993                                 * Switch to the attribute name state.
   1994                                 */
   1995                                state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
   1996                                // `break` optimizes; `continue stateloop;` would be valid
   1997                                break beforeattributenameloop;
   1998                        }
   1999                    }
   2000                    // CPPONLY: MOZ_FALLTHROUGH;
   2001                case ATTRIBUTE_NAME:
   2002                    attributenameloop: for (;;) {
   2003                        if (++pos == endPos) {
   2004                            break stateloop;
   2005                        }
   2006                        c = checkChar(buf, pos);
   2007                        /*
   2008                         * Consume the next input character:
   2009                         */
   2010                        switch (c) {
   2011                            case '\r':
   2012                                silentCarriageReturn();
   2013                                attributeNameComplete();
   2014                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
   2015                                break stateloop;
   2016                            case '\n':
   2017                                silentLineFeed();
   2018                                // CPPONLY: MOZ_FALLTHROUGH;
   2019                            case ' ':
   2020                            case '\t':
   2021                            case '\u000C':
   2022                                /*
   2023                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   2024                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   2025                                 * Switch to the after attribute name state.
   2026                                 */
   2027                                attributeNameComplete();
   2028                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
   2029                                continue stateloop;
   2030                            case '/':
   2031                                /*
   2032                                 * U+002F SOLIDUS (/) Switch to the self-closing
   2033                                 * start tag state.
   2034                                 */
   2035                                attributeNameComplete();
   2036                                addAttributeWithoutValue();
   2037                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
   2038                                continue stateloop;
   2039                            case '=':
   2040                                /*
   2041                                 * U+003D EQUALS SIGN (=) Switch to the before
   2042                                 * attribute value state.
   2043                                 */
   2044                                attributeNameComplete();
   2045                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
   2046                                // `break` optimizes; `continue stateloop;` would be valid
   2047                                break attributenameloop;
   2048                            case '>':
   2049                                /*
   2050                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   2051                                 * tag token.
   2052                                 */
   2053                                attributeNameComplete();
   2054                                addAttributeWithoutValue();
   2055                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   2056                                if (shouldSuspend) {
   2057                                    break stateloop;
   2058                                }
   2059                                /*
   2060                                 * Switch to the data state.
   2061                                 */
   2062                                continue stateloop;
   2063                            case '\u0000':
   2064                                c = '\uFFFD';
   2065                                // CPPONLY: MOZ_FALLTHROUGH;
   2066                            case '\"':
   2067                            case '\'':
   2068                            case '<':
   2069                                /*
   2070                                 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
   2071                                 * (') U+003C LESS-THAN SIGN (<) Parse error.
   2072                                 */
   2073                                errQuoteOrLtInAttributeNameOrNull(c);
   2074                                /*
   2075                                 * Treat it as per the "anything else" entry
   2076                                 * below.
   2077                                 */
   2078                                // CPPONLY: MOZ_FALLTHROUGH;
   2079                            default:
   2080                                if (c >= 'A' && c <= 'Z') {
   2081                                    /*
   2082                                     * U+0041 LATIN CAPITAL LETTER A through to
   2083                                     * U+005A LATIN CAPITAL LETTER Z Append the
   2084                                     * lowercase version of the current input
   2085                                     * character (add 0x0020 to the character's
   2086                                     * code point) to the current attribute's
   2087                                     * name.
   2088                                     */
   2089                                    c += 0x20;
   2090                                }
   2091                                /*
   2092                                 * Anything else Append the current input
   2093                                 * character to the current attribute's name.
   2094                                 */
   2095                                appendStrBuf(c);
   2096                                /*
   2097                                 * Stay in the attribute name state.
   2098                                 */
   2099                                continue;
   2100                        }
   2101                    }
   2102                    // CPPONLY: MOZ_FALLTHROUGH;
   2103                case BEFORE_ATTRIBUTE_VALUE:
   2104                    beforeattributevalueloop: for (;;) {
   2105                        if (++pos == endPos) {
   2106                            break stateloop;
   2107                        }
   2108                        c = checkChar(buf, pos);
   2109                        /*
   2110                         * Consume the next input character:
   2111                         */
   2112                        switch (c) {
   2113                            case '\r':
   2114                                silentCarriageReturn();
   2115                                break stateloop;
   2116                            case '\n':
   2117                                silentLineFeed();
   2118                                // CPPONLY: MOZ_FALLTHROUGH;
   2119                            case ' ':
   2120                            case '\t':
   2121                            case '\u000C':
   2122                                /*
   2123                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   2124                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   2125                                 * in the before attribute value state.
   2126                                 */
   2127                                continue;
   2128                            case '"':
   2129                                /*
   2130                                 * U+0022 QUOTATION MARK (") Switch to the
   2131                                 * attribute value (double-quoted) state.
   2132                                 */
   2133                                // CPPONLY: attributeLine = line;
   2134                                clearStrBufBeforeUse();
   2135                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
   2136                                // `break` optimizes; `continue stateloop;` would be valid
   2137                                break beforeattributevalueloop;
   2138                            case '&':
   2139                                /*
   2140                                 * U+0026 AMPERSAND (&) Switch to the attribute
   2141                                 * value (unquoted) state and reconsume this
   2142                                 * input character.
   2143                                 */
   2144                                // CPPONLY: attributeLine = line;
   2145                                clearStrBufBeforeUse();
   2146                                reconsume = true;
   2147                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
   2148                                noteUnquotedAttributeValue();
   2149                                continue stateloop;
   2150                            case '\'':
   2151                                /*
   2152                                 * U+0027 APOSTROPHE (') Switch to the attribute
   2153                                 * value (single-quoted) state.
   2154                                 */
   2155                                // CPPONLY: attributeLine = line;
   2156                                clearStrBufBeforeUse();
   2157                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
   2158                                continue stateloop;
   2159                            case '>':
   2160                                /*
   2161                                 * U+003E GREATER-THAN SIGN (>) Parse error.
   2162                                 */
   2163                                errAttributeValueMissing();
   2164                                /*
   2165                                 * Emit the current tag token.
   2166                                 */
   2167                                addAttributeWithoutValue();
   2168                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   2169                                if (shouldSuspend) {
   2170                                    break stateloop;
   2171                                }
   2172                                /*
   2173                                 * Switch to the data state.
   2174                                 */
   2175                                continue stateloop;
   2176                            case '\u0000':
   2177                                c = '\uFFFD';
   2178                                // CPPONLY: MOZ_FALLTHROUGH;
   2179                            case '<':
   2180                            case '=':
   2181                            case '`':
   2182                                /*
   2183                                 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
   2184                                 * (=) U+0060 GRAVE ACCENT (`)
   2185                                 */
   2186                                errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
   2187                                /*
   2188                                 * Treat it as per the "anything else" entry
   2189                                 * below.
   2190                                 */
   2191                                // CPPONLY: MOZ_FALLTHROUGH;
   2192                            default:
   2193                                /*
   2194                                 * Anything else Append the current input
   2195                                 * character to the current attribute's value.
   2196                                 */
   2197                                // CPPONLY: attributeLine = line;
   2198                                clearStrBufBeforeUse();
   2199                                appendStrBuf(c);
   2200                                /*
   2201                                 * Switch to the attribute value (unquoted)
   2202                                 * state.
   2203                                 */
   2204 
   2205                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
   2206                                noteUnquotedAttributeValue();
   2207                                continue stateloop;
   2208                        }
   2209                    }
   2210                    // CPPONLY: MOZ_FALLTHROUGH;
   2211                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
   2212                    attributevaluedoublequotedloop: for (;;) {
   2213                        if (reconsume) {
   2214                            reconsume = false;
   2215                        } else {
   2216                            ++pos;
   2217                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   2218                            // The line below advances pos by some number of code units that this state is indifferent to.
   2219                            // CPPONLY: pos += accelerateAdvancementAttributeValueDoubleQuoted(buf, pos, endPos);
   2220                            if (pos == endPos) {
   2221                                break stateloop;
   2222                            }
   2223                            c = checkChar(buf, pos);
   2224                        }
   2225                        /*
   2226                         * Consume the next input character:
   2227                         */
   2228                        switch (c) {
   2229                            case '"':
   2230                                /*
   2231                                 * U+0022 QUOTATION MARK (") Switch to the after
   2232                                 * attribute value (quoted) state.
   2233                                 */
   2234                                addAttributeWithValue();
   2235 
   2236                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
   2237                                // `break` optimizes; `continue stateloop;` would be valid
   2238                                break attributevaluedoublequotedloop;
   2239                            case '&':
   2240                                /*
   2241                                 * U+0026 AMPERSAND (&) Switch to the character
   2242                                 * reference in attribute value state, with the
   2243                                 * additional allowed character being U+0022
   2244                                 * QUOTATION MARK (").
   2245                                 */
   2246                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
   2247                                appendCharRefBuf(c);
   2248                                setAdditionalAndRememberAmpersandLocation('\"');
   2249                                returnState = state;
   2250                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
   2251                                continue stateloop;
   2252                            case '\r':
   2253                                appendStrBufCarriageReturn();
   2254                                break stateloop;
   2255                            case '\n':
   2256                                appendStrBufLineFeed();
   2257                                continue;
   2258                            case '\u0000':
   2259                                c = '\uFFFD';
   2260                                // CPPONLY: MOZ_FALLTHROUGH;
   2261                            default:
   2262                                /*
   2263                                 * Anything else Append the current input
   2264                                 * character to the current attribute's value.
   2265                                 */
   2266                                appendStrBuf(c);
   2267                                /*
   2268                                 * Stay in the attribute value (double-quoted)
   2269                                 * state.
   2270                                 */
   2271                                continue;
   2272                        }
   2273                    }
   2274                    // CPPONLY: MOZ_FALLTHROUGH;
   2275                case AFTER_ATTRIBUTE_VALUE_QUOTED:
   2276                    afterattributevaluequotedloop: for (;;) {
   2277                        if (++pos == endPos) {
   2278                            break stateloop;
   2279                        }
   2280                        c = checkChar(buf, pos);
   2281                        /*
   2282                         * Consume the next input character:
   2283                         */
   2284                        switch (c) {
   2285                            case '\r':
   2286                                silentCarriageReturn();
   2287                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   2288                                break stateloop;
   2289                            case '\n':
   2290                                silentLineFeed();
   2291                                // CPPONLY: MOZ_FALLTHROUGH;
   2292                            case ' ':
   2293                            case '\t':
   2294                            case '\u000C':
   2295                                /*
   2296                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   2297                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   2298                                 * Switch to the before attribute name state.
   2299                                 */
   2300                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   2301                                continue stateloop;
   2302                            case '/':
   2303                                /*
   2304                                 * U+002F SOLIDUS (/) Switch to the self-closing
   2305                                 * start tag state.
   2306                                 */
   2307                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
   2308                                // `break` optimizes; `continue stateloop;` would be valid
   2309                                break afterattributevaluequotedloop;
   2310                            case '>':
   2311                                /*
   2312                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   2313                                 * tag token.
   2314                                 */
   2315                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   2316                                if (shouldSuspend) {
   2317                                    break stateloop;
   2318                                }
   2319                                /*
   2320                                 * Switch to the data state.
   2321                                 */
   2322                                continue stateloop;
   2323                            default:
   2324                                /*
   2325                                 * Anything else Parse error.
   2326                                 */
   2327                                errNoSpaceBetweenAttributes();
   2328                                /*
   2329                                 * Reconsume the character in the before
   2330                                 * attribute name state.
   2331                                 */
   2332                                reconsume = true;
   2333                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   2334                                continue stateloop;
   2335                        }
   2336                    }
   2337                    // CPPONLY: MOZ_FALLTHROUGH;
   2338                case SELF_CLOSING_START_TAG:
   2339                    if (++pos == endPos) {
   2340                        break stateloop;
   2341                    }
   2342                    c = checkChar(buf, pos);
   2343                    /*
   2344                     * Consume the next input character:
   2345                     */
   2346                    switch (c) {
   2347                        case '>':
   2348                            /*
   2349                             * U+003E GREATER-THAN SIGN (>) Set the self-closing
   2350                             * flag of the current tag token. Emit the current
   2351                             * tag token.
   2352                             */
   2353                            state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
   2354                            if (shouldSuspend) {
   2355                                break stateloop;
   2356                            }
   2357                            /*
   2358                             * Switch to the data state.
   2359                             */
   2360                            continue stateloop;
   2361                        default:
   2362                            /* Anything else Parse error. */
   2363                            errSlashNotFollowedByGt();
   2364                            /*
   2365                             * Reconsume the character in the before attribute
   2366                             * name state.
   2367                             */
   2368                            reconsume = true;
   2369                            state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   2370                            continue stateloop;
   2371                    }
   2372                    // no fallthrough, reordering opportunity
   2373                case ATTRIBUTE_VALUE_UNQUOTED:
   2374                    for (;;) {
   2375                        if (reconsume) {
   2376                            reconsume = false;
   2377                        } else {
   2378                            if (++pos == endPos) {
   2379                                break stateloop;
   2380                            }
   2381                            c = checkChar(buf, pos);
   2382                        }
   2383                        /*
   2384                         * Consume the next input character:
   2385                         */
   2386                        switch (c) {
   2387                            case '\r':
   2388                                silentCarriageReturn();
   2389                                addAttributeWithValue();
   2390                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   2391                                break stateloop;
   2392                            case '\n':
   2393                                silentLineFeed();
   2394                                // CPPONLY: MOZ_FALLTHROUGH;
   2395                            case ' ':
   2396                            case '\t':
   2397                            case '\u000C':
   2398                                /*
   2399                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   2400                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   2401                                 * Switch to the before attribute name state.
   2402                                 */
   2403                                addAttributeWithValue();
   2404                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   2405                                continue stateloop;
   2406                            case '&':
   2407                                /*
   2408                                 * U+0026 AMPERSAND (&) Switch to the character
   2409                                 * reference in attribute value state, with the
   2410                                 * additional allowed character being U+003E
   2411                                 * GREATER-THAN SIGN (>)
   2412                                 */
   2413                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
   2414                                appendCharRefBuf(c);
   2415                                setAdditionalAndRememberAmpersandLocation('>');
   2416                                returnState = state;
   2417                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
   2418                                continue stateloop;
   2419                            case '>':
   2420                                /*
   2421                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   2422                                 * tag token.
   2423                                 */
   2424                                addAttributeWithValue();
   2425                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   2426                                if (shouldSuspend) {
   2427                                    break stateloop;
   2428                                }
   2429                                /*
   2430                                 * Switch to the data state.
   2431                                 */
   2432                                continue stateloop;
   2433                            case '\u0000':
   2434                                c = '\uFFFD';
   2435                                // CPPONLY: MOZ_FALLTHROUGH;
   2436                            case '<':
   2437                            case '\"':
   2438                            case '\'':
   2439                            case '=':
   2440                            case '`':
   2441                                /*
   2442                                 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
   2443                                 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
   2444                                 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
   2445                                 */
   2446                                errUnquotedAttributeValOrNull(c);
   2447                                /*
   2448                                 * Treat it as per the "anything else" entry
   2449                                 * below.
   2450                                 */
   2451                                // CPPONLY: MOZ_FALLTHROUGH;
   2452                            default:
   2453                                /*
   2454                                 * Anything else Append the current input
   2455                                 * character to the current attribute's value.
   2456                                 */
   2457                                appendStrBuf(c);
   2458                                /*
   2459                                 * Stay in the attribute value (unquoted) state.
   2460                                 */
   2461                                continue;
   2462                        }
   2463                    }
   2464                    // no fallthrough, reordering opportunity
   2465               case AFTER_ATTRIBUTE_NAME:
   2466                    for (;;) {
   2467                        if (++pos == endPos) {
   2468                            break stateloop;
   2469                        }
   2470                        c = checkChar(buf, pos);
   2471                        /*
   2472                         * Consume the next input character:
   2473                         */
   2474                        switch (c) {
   2475                            case '\r':
   2476                                silentCarriageReturn();
   2477                                break stateloop;
   2478                            case '\n':
   2479                                silentLineFeed();
   2480                                // CPPONLY: MOZ_FALLTHROUGH;
   2481                            case ' ':
   2482                            case '\t':
   2483                            case '\u000C':
   2484                                /*
   2485                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   2486                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   2487                                 * in the after attribute name state.
   2488                                 */
   2489                                continue;
   2490                            case '/':
   2491                                /*
   2492                                 * U+002F SOLIDUS (/) Switch to the self-closing
   2493                                 * start tag state.
   2494                                 */
   2495                                addAttributeWithoutValue();
   2496                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
   2497                                continue stateloop;
   2498                            case '=':
   2499                                /*
   2500                                 * U+003D EQUALS SIGN (=) Switch to the before
   2501                                 * attribute value state.
   2502                                 */
   2503                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
   2504                                continue stateloop;
   2505                            case '>':
   2506                                /*
   2507                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   2508                                 * tag token.
   2509                                 */
   2510                                addAttributeWithoutValue();
   2511                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   2512                                if (shouldSuspend) {
   2513                                    break stateloop;
   2514                                }
   2515                                /*
   2516                                 * Switch to the data state.
   2517                                 */
   2518                                continue stateloop;
   2519                            case '\u0000':
   2520                                c = '\uFFFD';
   2521                                // CPPONLY: MOZ_FALLTHROUGH;
   2522                            case '\"':
   2523                            case '\'':
   2524                            case '<':
   2525                                errQuoteOrLtInAttributeNameOrNull(c);
   2526                                /*
   2527                                 * Treat it as per the "anything else" entry
   2528                                 * below.
   2529                                 */
   2530                                // CPPONLY: MOZ_FALLTHROUGH;
   2531                            default:
   2532                                addAttributeWithoutValue();
   2533                                /*
   2534                                 * Anything else Start a new attribute in the
   2535                                 * current tag token.
   2536                                 */
   2537                                if (c >= 'A' && c <= 'Z') {
   2538                                    /*
   2539                                     * U+0041 LATIN CAPITAL LETTER A through to
   2540                                     * U+005A LATIN CAPITAL LETTER Z Set that
   2541                                     * attribute's name to the lowercase version
   2542                                     * of the current input character (add
   2543                                     * 0x0020 to the character's code point)
   2544                                     */
   2545                                    c += 0x20;
   2546                                }
   2547                                /*
   2548                                 * Set that attribute's name to the current
   2549                                 * input character,
   2550                                 */
   2551                                clearStrBufBeforeUse();
   2552                                appendStrBuf(c);
   2553                                /*
   2554                                 * and its value to the empty string.
   2555                                 */
   2556                                // Will do later.
   2557                                /*
   2558                                 * Switch to the attribute name state.
   2559                                 */
   2560                                state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
   2561                                continue stateloop;
   2562                        }
   2563                    }
   2564                    // no fallthrough, reordering opportunity
   2565                case MARKUP_DECLARATION_OPEN:
   2566                    markupdeclarationopenloop: for (;;) {
   2567                        if (++pos == endPos) {
   2568                            break stateloop;
   2569                        }
   2570                        c = checkChar(buf, pos);
   2571                        /*
   2572                         * If the next two characters are both U+002D
   2573                         * HYPHEN-MINUS characters (-), consume those two
   2574                         * characters, create a comment token whose data is the
   2575                         * empty string, and switch to the comment start state.
   2576                         *
   2577                         * Otherwise, if the next seven characters are an ASCII
   2578                         * case-insensitive match for the word "DOCTYPE", then
   2579                         * consume those characters and switch to the DOCTYPE
   2580                         * state.
   2581                         *
   2582                         * Otherwise, if the insertion mode is
   2583                         * "in foreign content" and the current node is not an
   2584                         * element in the HTML namespace and the next seven
   2585                         * characters are a case-sensitive match for the string
   2586                         * "[CDATA[" (the five uppercase letters "CDATA" with a
   2587                         * U+005B LEFT SQUARE BRACKET character before and
   2588                         * after), then consume those characters and switch to
   2589                         * the CDATA section state.
   2590                         *
   2591                         * Otherwise, this is a parse error. Switch to the bogus
   2592                         * comment state. The next character that is consumed,
   2593                         * if any, is the first character that will be in the
   2594                         * comment.
   2595                         */
   2596                        switch (c) {
   2597                            case '-':
   2598                                clearStrBufBeforeUse();
   2599                                appendStrBuf(c);
   2600                                state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
   2601                                // `break` optimizes; `continue stateloop;` would be valid
   2602                                break markupdeclarationopenloop;
   2603                            case 'd':
   2604                            case 'D':
   2605                                clearStrBufBeforeUse();
   2606                                appendStrBuf(c);
   2607                                index = 0;
   2608                                state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
   2609                                continue stateloop;
   2610                            case '[':
   2611                                if (tokenHandler.cdataSectionAllowed()) {
   2612                                    clearStrBufBeforeUse();
   2613                                    appendStrBuf(c);
   2614                                    index = 0;
   2615                                    state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
   2616                                    continue stateloop;
   2617                                }
   2618                                // CPPONLY: MOZ_FALLTHROUGH;
   2619                            default:
   2620                                errBogusComment();
   2621                                clearStrBufBeforeUse();
   2622                                reconsume = true;
   2623                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   2624                                continue stateloop;
   2625                        }
   2626                    }
   2627                    // CPPONLY: MOZ_FALLTHROUGH;
   2628                case MARKUP_DECLARATION_HYPHEN:
   2629                    markupdeclarationhyphenloop: for (;;) {
   2630                        if (++pos == endPos) {
   2631                            break stateloop;
   2632                        }
   2633                        c = checkChar(buf, pos);
   2634                        switch (c) {
   2635                            case '-':
   2636                                clearStrBufAfterOneHyphen();
   2637                                state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
   2638                                // `break` optimizes; `continue stateloop;` would be valid
   2639                                break markupdeclarationhyphenloop;
   2640                            default:
   2641                                errBogusComment();
   2642                                reconsume = true;
   2643                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   2644                                continue stateloop;
   2645                        }
   2646                    }
   2647                    // CPPONLY: MOZ_FALLTHROUGH;
   2648                case COMMENT_START:
   2649                    reportedConsecutiveHyphens = false;
   2650                    commentstartloop: for (;;) {
   2651                        if (++pos == endPos) {
   2652                            break stateloop;
   2653                        }
   2654                        c = checkChar(buf, pos);
   2655                        /*
   2656                         * Comment start state
   2657                         *
   2658                         *
   2659                         * Consume the next input character:
   2660                         */
   2661                        switch (c) {
   2662                            case '-':
   2663                                /*
   2664                                 * U+002D HYPHEN-MINUS (-) Switch to the comment
   2665                                 * start dash state.
   2666                                 */
   2667                                appendStrBuf(c);
   2668                                state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
   2669                                continue stateloop;
   2670                            case '>':
   2671                                /*
   2672                                 * U+003E GREATER-THAN SIGN (>) Parse error.
   2673                                 */
   2674                                errPrematureEndOfComment();
   2675                                /* Emit the comment token. */
   2676                                emitComment(0, pos);
   2677                                /*
   2678                                 * Switch to the data state.
   2679                                 */
   2680                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   2681                                if (shouldSuspend) {
   2682                                    break stateloop;
   2683                                }
   2684                                continue stateloop;
   2685                            case '<':
   2686                                appendStrBuf(c);
   2687                                state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
   2688                                continue stateloop;
   2689                            case '\r':
   2690                                appendStrBufCarriageReturn();
   2691                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2692                                break stateloop;
   2693                            case '\n':
   2694                                appendStrBufLineFeed();
   2695                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2696                                break commentstartloop;
   2697                            case '\u0000':
   2698                                c = '\uFFFD';
   2699                                // CPPONLY: MOZ_FALLTHROUGH;
   2700                            default:
   2701                                /*
   2702                                 * Anything else Append the input character to
   2703                                 * the comment token's data.
   2704                                 */
   2705                                appendStrBuf(c);
   2706                                /*
   2707                                 * Switch to the comment state.
   2708                                 */
   2709                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2710                                // `break` optimizes; `continue stateloop;` would be valid
   2711                                break commentstartloop;
   2712                        }
   2713                    }
   2714                    // CPPONLY: MOZ_FALLTHROUGH;
   2715                case COMMENT:
   2716                    commentloop: for (;;) {
   2717                        ++pos;
   2718                        // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   2719                        // The line below advances pos by some number of code units that this state is indifferent to.
   2720                        // CPPONLY: pos += accelerateAdvancementComment(buf, pos, endPos);
   2721                        if (pos == endPos) {
   2722                            break stateloop;
   2723                        }
   2724                        c = checkChar(buf, pos);
   2725                        /*
   2726                         * Comment state Consume the next input character:
   2727                         */
   2728                        switch (c) {
   2729                            case '-':
   2730                                /*
   2731                                 * U+002D HYPHEN-MINUS (-) Switch to the comment
   2732                                 * end dash state
   2733                                 */
   2734                                appendStrBuf(c);
   2735                                state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
   2736                                // `break` optimizes; `continue stateloop;` would be valid
   2737                                break commentloop;
   2738                            case '<':
   2739                                appendStrBuf(c);
   2740                                state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
   2741                                continue stateloop;
   2742                            case '\r':
   2743                                appendStrBufCarriageReturn();
   2744                                break stateloop;
   2745                            case '\n':
   2746                                appendStrBufLineFeed();
   2747                                continue;
   2748                            case '\u0000':
   2749                                c = '\uFFFD';
   2750                                // CPPONLY: MOZ_FALLTHROUGH;
   2751                            default:
   2752                                /*
   2753                                 * Anything else Append the input character to
   2754                                 * the comment token's data.
   2755                                 */
   2756                                appendStrBuf(c);
   2757                                /*
   2758                                 * Stay in the comment state.
   2759                                 */
   2760                                continue;
   2761                        }
   2762                    }
   2763                    // CPPONLY: MOZ_FALLTHROUGH;
   2764                case COMMENT_END_DASH:
   2765                    commentenddashloop: for (;;) {
   2766                        if (++pos == endPos) {
   2767                            break stateloop;
   2768                        }
   2769                        c = checkChar(buf, pos);
   2770                        /*
   2771                         * Comment end dash state Consume the next input
   2772                         * character:
   2773                         */
   2774                        switch (c) {
   2775                            case '-':
   2776                                /*
   2777                                 * U+002D HYPHEN-MINUS (-) Switch to the comment
   2778                                 * end state
   2779                                 */
   2780                                appendStrBuf(c);
   2781                                state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
   2782                                // `break` optimizes; `continue stateloop;` would be valid
   2783                                break commentenddashloop;
   2784                            case '<':
   2785                                appendStrBuf(c);
   2786                                state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
   2787                                continue stateloop;
   2788                            case '\r':
   2789                                appendStrBufCarriageReturn();
   2790                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2791                                break stateloop;
   2792                            case '\n':
   2793                                appendStrBufLineFeed();
   2794                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2795                                continue stateloop;
   2796                            case '\u0000':
   2797                                c = '\uFFFD';
   2798                                // CPPONLY: MOZ_FALLTHROUGH;
   2799                            default:
   2800                                /*
   2801                                 * Anything else Append a U+002D HYPHEN-MINUS
   2802                                 * (-) character and the input character to the
   2803                                 * comment token's data.
   2804                                 */
   2805                                appendStrBuf(c);
   2806                                /*
   2807                                 * Switch to the comment state.
   2808                                 */
   2809                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2810                                continue stateloop;
   2811                        }
   2812                    }
   2813                    // CPPONLY: MOZ_FALLTHROUGH;
   2814                case COMMENT_END:
   2815                    commentendloop: for (;;) {
   2816                        if (++pos == endPos) {
   2817                            break stateloop;
   2818                        }
   2819                        c = checkChar(buf, pos);
   2820                        /*
   2821                         * Comment end state Consume the next input
   2822                         * character:
   2823                         */
   2824                        switch (c) {
   2825                            case '>':
   2826                                /*
   2827                                 * U+003E GREATER-THAN SIGN (>) Emit the comment
   2828                                 * token.
   2829                                 */
   2830                                emitComment(2, pos);
   2831                                /*
   2832                                 * Switch to the data state.
   2833                                 */
   2834                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   2835                                if (shouldSuspend) {
   2836                                    break stateloop;
   2837                                }
   2838                                continue stateloop;
   2839                            case '-':
   2840                                /* U+002D HYPHEN-MINUS (-) Parse error. */
   2841                                /*
   2842                                 * Append a U+002D HYPHEN-MINUS (-) character to
   2843                                 * the comment token's data.
   2844                                 */
   2845                                adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
   2846                                reportedConsecutiveHyphens = true;
   2847                                /*
   2848                                 * Stay in the comment end state.
   2849                                 */
   2850                                continue;
   2851                            case '<':
   2852                                appendStrBuf(c);
   2853                                state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
   2854                                continue stateloop;
   2855                            case '\r':
   2856                                adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
   2857                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2858                                break stateloop;
   2859                            case '\n':
   2860                                adjustDoubleHyphenAndAppendToStrBufLineFeed();
   2861                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2862                                continue stateloop;
   2863                            case '!':
   2864                                appendStrBuf(c);
   2865                                state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
   2866                                // `break` optimizes; `continue stateloop;` would be valid
   2867                                break commentendloop;
   2868                            case '\u0000':
   2869                                c = '\uFFFD';
   2870                                // CPPONLY: MOZ_FALLTHROUGH;
   2871                            default:
   2872                                /*
   2873                                 * Append two U+002D HYPHEN-MINUS (-) characters
   2874                                 * and the input character to the comment
   2875                                 * token's data.
   2876                                 */
   2877                                adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
   2878                                reportedConsecutiveHyphens = true;
   2879                                /*
   2880                                 * Switch to the comment state.
   2881                                 */
   2882                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2883                                continue stateloop;
   2884                        }
   2885                    }
   2886                    // CPPONLY: MOZ_FALLTHROUGH;
   2887                case COMMENT_END_BANG:
   2888                    for (;;) {
   2889                        if (++pos == endPos) {
   2890                            break stateloop;
   2891                        }
   2892                        c = checkChar(buf, pos);
   2893                        /*
   2894                         * Comment end bang state
   2895                         *
   2896                         * Consume the next input character:
   2897                         */
   2898                        switch (c) {
   2899                            case '>':
   2900                                /*
   2901                                 * U+003E GREATER-THAN SIGN (>) Emit the comment
   2902                                 * token.
   2903                                 */
   2904                                emitComment(3, pos);
   2905                                /*
   2906                                 * Switch to the data state.
   2907                                 */
   2908                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   2909                                if (shouldSuspend) {
   2910                                    break stateloop;
   2911                                }
   2912                                continue stateloop;
   2913                            case '-':
   2914                                /*
   2915                                 * Append two U+002D HYPHEN-MINUS (-) characters
   2916                                 * and a U+0021 EXCLAMATION MARK (!) character
   2917                                 * to the comment token's data.
   2918                                 */
   2919                                appendStrBuf(c);
   2920                                /*
   2921                                 * Switch to the comment end dash state.
   2922                                 */
   2923                                state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
   2924                                continue stateloop;
   2925                            case '\r':
   2926                                appendStrBufCarriageReturn();
   2927                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2928                                break stateloop;
   2929                            case '\n':
   2930                                appendStrBufLineFeed();
   2931                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2932                                continue stateloop;
   2933                            case '\u0000':
   2934                                c = '\uFFFD';
   2935                                // CPPONLY: MOZ_FALLTHROUGH;
   2936                            default:
   2937                                /*
   2938                                 * Anything else Append two U+002D HYPHEN-MINUS
   2939                                 * (-) characters, a U+0021 EXCLAMATION MARK (!)
   2940                                 * character, and the input character to the
   2941                                 * comment token's data. Switch to the comment
   2942                                 * state.
   2943                                 */
   2944                                appendStrBuf(c);
   2945                                /*
   2946                                 * Switch to the comment state.
   2947                                 */
   2948                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2949                                continue stateloop;
   2950                        }
   2951                    }
   2952                    // no fallthrough, reordering opportunity
   2953                case COMMENT_LESSTHAN:
   2954                    commentlessthanloop: for (;;) {
   2955                        if (++pos == endPos) {
   2956                            break stateloop;
   2957                        }
   2958                        c = checkChar(buf, pos);
   2959                        switch (c) {
   2960                            case '!':
   2961                                appendStrBuf(c);
   2962                                state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG, reconsume, pos);
   2963                                // `break` optimizes; `continue stateloop;` would be valid
   2964                                break commentlessthanloop;
   2965                            case '<':
   2966                                appendStrBuf(c);
   2967                                continue;
   2968                            case '-':
   2969                                appendStrBuf(c);
   2970                                state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
   2971                                continue stateloop;
   2972                            case '\r':
   2973                                appendStrBufCarriageReturn();
   2974                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2975                                break stateloop;
   2976                            case '\n':
   2977                                appendStrBufLineFeed();
   2978                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2979                                continue stateloop;
   2980                            case '\u0000':
   2981                                c = '\uFFFD';
   2982                                // CPPONLY: MOZ_FALLTHROUGH;
   2983                            default:
   2984                                appendStrBuf(c);
   2985                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   2986                                continue stateloop;
   2987                        }
   2988                    }
   2989                    // CPPONLY: MOZ_FALLTHROUGH;
   2990                case COMMENT_LESSTHAN_BANG:
   2991                    commentlessthanbangloop: for (;;) {
   2992                        if (++pos == endPos) {
   2993                            break stateloop;
   2994                        }
   2995                        c = checkChar(buf, pos);
   2996                        switch (c) {
   2997                            case '-':
   2998                                appendStrBuf(c);
   2999                                state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH, reconsume, pos);
   3000                                // `break` optimizes; `continue stateloop;` would be valid
   3001                                break commentlessthanbangloop;
   3002                            case '<':
   3003                                appendStrBuf(c);
   3004                                state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
   3005                                continue stateloop;
   3006                            case '\r':
   3007                                appendStrBufCarriageReturn();
   3008                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   3009                                break stateloop;
   3010                            case '\n':
   3011                                appendStrBufLineFeed();
   3012                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   3013                                continue stateloop;
   3014                            case '\u0000':
   3015                                c = '\uFFFD';
   3016                                // CPPONLY: MOZ_FALLTHROUGH;
   3017                            default:
   3018                                appendStrBuf(c);
   3019                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   3020                                continue stateloop;
   3021                        }
   3022                    }
   3023                    // CPPONLY: MOZ_FALLTHROUGH;
   3024                case COMMENT_LESSTHAN_BANG_DASH:
   3025                    if (++pos == endPos) {
   3026                        break stateloop;
   3027                    }
   3028                    c = checkChar(buf, pos);
   3029                    switch (c) {
   3030                        case '-':
   3031                            appendStrBuf(c);
   3032                            state = transition(state,
   3033                                    Tokenizer.COMMENT_LESSTHAN_BANG_DASH_DASH,
   3034                                    reconsume, pos);
   3035                            // `break` optimizes; `continue stateloop;` would be valid
   3036                            break;
   3037                        case '<':
   3038                            appendStrBuf(c);
   3039                            state = transition(state,
   3040                                    Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
   3041                            continue stateloop;
   3042                        case '\r':
   3043                            appendStrBufCarriageReturn();
   3044                            state = transition(state, Tokenizer.COMMENT,
   3045                                    reconsume, pos);
   3046                            break stateloop;
   3047                        case '\n':
   3048                            appendStrBufLineFeed();
   3049                            state = transition(state, Tokenizer.COMMENT,
   3050                                    reconsume, pos);
   3051                            continue stateloop;
   3052                        case '\u0000':
   3053                            c = '\uFFFD';
   3054                            // CPPONLY: MOZ_FALLTHROUGH;
   3055                        default:
   3056                            appendStrBuf(c);
   3057                            state = transition(state, Tokenizer.COMMENT,
   3058                                    reconsume, pos);
   3059                            continue stateloop;
   3060                    }
   3061                    // CPPONLY: MOZ_FALLTHROUGH;
   3062                case COMMENT_LESSTHAN_BANG_DASH_DASH:
   3063                    if (++pos == endPos) {
   3064                        break stateloop;
   3065                    }
   3066                    c = checkChar(buf, pos);
   3067                    switch (c) {
   3068                        case '>':
   3069                            appendStrBuf(c);
   3070                            emitComment(3, pos);
   3071                            state = transition(state, Tokenizer.DATA, reconsume,
   3072                                    pos);
   3073                            if (shouldSuspend) {
   3074                                break stateloop;
   3075                            }
   3076                            continue stateloop;
   3077                        case '-':
   3078                            errNestedComment();
   3079                            adjustDoubleHyphenAndAppendToStrBufAndErr(c,
   3080                                    reportedConsecutiveHyphens);
   3081                            reportedConsecutiveHyphens = true;
   3082                            state = transition(state, Tokenizer.COMMENT_END,
   3083                                    reconsume, pos);
   3084                            continue stateloop;
   3085                        case '\r':
   3086                            c = '\n';
   3087                            silentCarriageReturn();
   3088                            errNestedComment();
   3089                            adjustDoubleHyphenAndAppendToStrBufAndErr(c,
   3090                                    reportedConsecutiveHyphens);
   3091                            reportedConsecutiveHyphens = true;
   3092                            state = transition(state, Tokenizer.COMMENT,
   3093                                    reconsume, pos);
   3094                            break stateloop;
   3095                        case '\n':
   3096                            silentLineFeed();
   3097                            errNestedComment();
   3098                            adjustDoubleHyphenAndAppendToStrBufAndErr(c,
   3099                                    reportedConsecutiveHyphens);
   3100                            reportedConsecutiveHyphens = true;
   3101                            state = transition(state, Tokenizer.COMMENT,
   3102                                    reconsume, pos);
   3103                            continue stateloop;
   3104                        case '!':
   3105                            errNestedComment();
   3106                            adjustDoubleHyphenAndAppendToStrBufAndErr(c,
   3107                                    reportedConsecutiveHyphens);
   3108                            reportedConsecutiveHyphens = true;
   3109                            state = transition(state,
   3110                                    Tokenizer.COMMENT_END_BANG, reconsume, pos);
   3111                            continue stateloop;
   3112                        case '\u0000':
   3113                            c = '\uFFFD';
   3114                            // CPPONLY: MOZ_FALLTHROUGH;
   3115                        default:
   3116                            errNestedComment();
   3117                            adjustDoubleHyphenAndAppendToStrBufAndErr(c,
   3118                                    reportedConsecutiveHyphens);
   3119                            reportedConsecutiveHyphens = true;
   3120                            state = transition(state, Tokenizer.COMMENT,
   3121                                    reconsume, pos);
   3122                            continue stateloop;
   3123                    }
   3124                    // no fallthrough, reordering opportunity
   3125                case COMMENT_START_DASH:
   3126                    if (++pos == endPos) {
   3127                        break stateloop;
   3128                    }
   3129                    c = checkChar(buf, pos);
   3130                    /*
   3131                     * Comment start dash state
   3132                     *
   3133                     * Consume the next input character:
   3134                     */
   3135                    switch (c) {
   3136                        case '-':
   3137                            /*
   3138                             * U+002D HYPHEN-MINUS (-) Switch to the comment end
   3139                             * state
   3140                             */
   3141                            appendStrBuf(c);
   3142                            state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
   3143                            continue stateloop;
   3144                        case '>':
   3145                            errPrematureEndOfComment();
   3146                            /* Emit the comment token. */
   3147                            emitComment(1, pos);
   3148                            /*
   3149                             * Switch to the data state.
   3150                             */
   3151                            state = transition(state, Tokenizer.DATA, reconsume, pos);
   3152                            if (shouldSuspend) {
   3153                                break stateloop;
   3154                            }
   3155                            continue stateloop;
   3156                        case '<':
   3157                            appendStrBuf(c);
   3158                            state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
   3159                            continue stateloop;
   3160                        case '\r':
   3161                            appendStrBufCarriageReturn();
   3162                            state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   3163                            break stateloop;
   3164                        case '\n':
   3165                            appendStrBufLineFeed();
   3166                            state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   3167                            continue stateloop;
   3168                        case '\u0000':
   3169                            c = '\uFFFD';
   3170                            // CPPONLY: MOZ_FALLTHROUGH;
   3171                        default:
   3172                            /*
   3173                             * Append a U+002D HYPHEN-MINUS character (-) and
   3174                             * the current input character to the comment
   3175                             * token's data.
   3176                             */
   3177                            appendStrBuf(c);
   3178                            /*
   3179                             * Switch to the comment state.
   3180                             */
   3181                            state = transition(state, Tokenizer.COMMENT, reconsume, pos);
   3182                            continue stateloop;
   3183                    }
   3184                    // no fallthrough, reordering opportunity
   3185                case CDATA_START:
   3186                    for (;;) {
   3187                        if (++pos == endPos) {
   3188                            break stateloop;
   3189                        }
   3190                        c = checkChar(buf, pos);
   3191                        if (index < 6) { // CDATA_LSQB.length
   3192                            if (c == Tokenizer.CDATA_LSQB[index]) {
   3193                                appendStrBuf(c);
   3194                            } else {
   3195                                errBogusComment();
   3196                                reconsume = true;
   3197                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   3198                                continue stateloop;
   3199                            }
   3200                            index++;
   3201                            continue;
   3202                        } else {
   3203                            clearStrBufAfterUse();
   3204                            cstart = pos; // start coalescing
   3205                            reconsume = true;
   3206                            state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
   3207                            // `break` optimizes; `continue stateloop;` would be valid
   3208                            break;
   3209                        }
   3210                    }
   3211                    // CPPONLY: MOZ_FALLTHROUGH;
   3212                case CDATA_SECTION:
   3213                    cdatasectionloop: for (;;) {
   3214                        if (reconsume) {
   3215                            reconsume = false;
   3216                        } else {
   3217                            ++pos;
   3218                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   3219                            // The line below advances pos by some number of code units that this state is indifferent to.
   3220                            // CPPONLY: pos += accelerateAdvancementCdataSection(buf, pos, endPos);
   3221                            if (pos == endPos) {
   3222                                break stateloop;
   3223                            }
   3224                            c = checkChar(buf, pos);
   3225                        }
   3226                        switch (c) {
   3227                            case ']':
   3228                                flushChars(buf, pos);
   3229                                state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
   3230                                // `break` optimizes; `continue stateloop;` would be valid
   3231                                break cdatasectionloop;
   3232                            case '\u0000':
   3233                                maybeEmitReplacementCharacter(buf, pos);
   3234                                continue;
   3235                            case '\r':
   3236                                emitCarriageReturn(buf, pos);
   3237                                break stateloop;
   3238                            case '\n':
   3239                                silentLineFeed();
   3240                                // CPPONLY: MOZ_FALLTHROUGH;
   3241                            default:
   3242                                continue;
   3243                        }
   3244                    }
   3245                    // CPPONLY: MOZ_FALLTHROUGH;
   3246                case CDATA_RSQB:
   3247                    if (++pos == endPos) {
   3248                        break stateloop;
   3249                    }
   3250                    c = checkChar(buf, pos);
   3251                    switch (c) {
   3252                        case ']':
   3253                            state = transition(state, Tokenizer.CDATA_RSQB_RSQB,
   3254                                    reconsume, pos);
   3255                            // `break` optimizes; `continue stateloop;` would be valid
   3256                            break;
   3257                        default:
   3258                            tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
   3259                            cstart = pos;
   3260                            reconsume = true;
   3261                            state = transition(state, Tokenizer.CDATA_SECTION,
   3262                                    reconsume, pos);
   3263                            continue stateloop;
   3264                    }
   3265                    // CPPONLY: MOZ_FALLTHROUGH;
   3266                case CDATA_RSQB_RSQB:
   3267                    cdatarsqbrsqb: for (;;) {
   3268                        if (++pos == endPos) {
   3269                            break stateloop;
   3270                        }
   3271                        c = checkChar(buf, pos);
   3272                        switch (c) {
   3273                            case ']':
   3274                                // Saw a third ]. Emit one ] (logically the
   3275                                // first one) and stay in this state to
   3276                                // remember that the last two characters seen
   3277                                // have been ]].
   3278                                tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
   3279                                continue;
   3280                            case '>':
   3281                                cstart = pos + 1;
   3282                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   3283                                // Since a CDATA section starts with a less-than sign, it
   3284                                // participates in the suspension-after-current-token
   3285                                // behavior. (The suspension can be requested when the
   3286                                // less-than sign has been seen but we don't yet know the
   3287                                // resulting token type.) Therefore, we need to deal with
   3288                                // a potential request here.
   3289                                suspendIfRequestedAfterCurrentNonTextToken();
   3290                                if (shouldSuspend) {
   3291                                    break stateloop;
   3292                                }
   3293                                continue stateloop;
   3294                            default:
   3295                                tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
   3296                                cstart = pos;
   3297                                reconsume = true;
   3298                                state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
   3299                                continue stateloop;
   3300                        }
   3301                    }
   3302                    // no fallthrough, reordering opportunity
   3303                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
   3304                    attributevaluesinglequotedloop: for (;;) {
   3305                        if (reconsume) {
   3306                            reconsume = false;
   3307                        } else {
   3308                            ++pos;
   3309                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   3310                            // The line below advances pos by some number of code units that this state is indifferent to.
   3311                            // CPPONLY: pos += accelerateAdvancementAttributeValueSingleQuoted(buf, pos, endPos);
   3312                            if (pos == endPos) {
   3313                                break stateloop;
   3314                            }
   3315                            c = checkChar(buf, pos);
   3316                        }
   3317                        /*
   3318                         * Consume the next input character:
   3319                         */
   3320                        switch (c) {
   3321                            case '\'':
   3322                                /*
   3323                                 * U+0027 APOSTROPHE (') Switch to the after
   3324                                 * attribute value (quoted) state.
   3325                                 */
   3326                                addAttributeWithValue();
   3327 
   3328                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
   3329                                continue stateloop;
   3330                            case '&':
   3331                                /*
   3332                                 * U+0026 AMPERSAND (&) Switch to the character
   3333                                 * reference in attribute value state, with the
   3334                                 * additional allowed character being U+0027
   3335                                 * APOSTROPHE (').
   3336                                 */
   3337                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
   3338                                appendCharRefBuf(c);
   3339                                setAdditionalAndRememberAmpersandLocation('\'');
   3340                                returnState = state;
   3341                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
   3342                                // `break` optimizes; `continue stateloop;` would be valid
   3343                                break attributevaluesinglequotedloop;
   3344                            case '\r':
   3345                                appendStrBufCarriageReturn();
   3346                                break stateloop;
   3347                            case '\n':
   3348                                appendStrBufLineFeed();
   3349                                continue;
   3350                            case '\u0000':
   3351                                c = '\uFFFD';
   3352                                // CPPONLY: MOZ_FALLTHROUGH;
   3353                            default:
   3354                                /*
   3355                                 * Anything else Append the current input
   3356                                 * character to the current attribute's value.
   3357                                 */
   3358                                appendStrBuf(c);
   3359                                /*
   3360                                 * Stay in the attribute value (single-quoted)
   3361                                 * state.
   3362                                 */
   3363                                continue;
   3364                        }
   3365                    }
   3366                    // CPPONLY: MOZ_FALLTHROUGH;
   3367                case CONSUME_CHARACTER_REFERENCE:
   3368                    if (++pos == endPos) {
   3369                        break stateloop;
   3370                    }
   3371                    c = checkChar(buf, pos);
   3372                    /*
   3373                     * Unlike the definition in the spec, this state does not
   3374                     * return a value and never requires the caller to
   3375                     * backtrack. This state takes care of emitting characters
   3376                     * or appending to the current attribute value. It also
   3377                     * takes care of that in the case when consuming the
   3378                     * character reference fails.
   3379                     */
   3380                    /*
   3381                     * This section defines how to consume a character
   3382                     * reference. This definition is used when parsing character
   3383                     * references in text and in attributes.
   3384                     *
   3385                     * The behavior depends on the identity of the next
   3386                     * character (the one immediately after the U+0026 AMPERSAND
   3387                     * character):
   3388                     */
   3389                    switch (c) {
   3390                        case ' ':
   3391                        case '\t':
   3392                        case '\n':
   3393                        case '\r': // we'll reconsume!
   3394                        case '\u000C':
   3395                        case '<':
   3396                        case '&':
   3397                        case '\u0000':
   3398                        case ';':
   3399                            emitOrAppendCharRefBuf(returnState);
   3400                            if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3401                                cstart = pos;
   3402                            }
   3403                            reconsume = true;
   3404                            state = transition(state, returnState, reconsume, pos);
   3405                            continue stateloop;
   3406                        case '#':
   3407                            /*
   3408                             * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
   3409                             * SIGN.
   3410                             */
   3411                            appendCharRefBuf('#');
   3412                            state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
   3413                            continue stateloop;
   3414                        default:
   3415                            if (c == additional) {
   3416                                emitOrAppendCharRefBuf(returnState);
   3417                                reconsume = true;
   3418                                state = transition(state, returnState, reconsume, pos);
   3419                                continue stateloop;
   3420                            }
   3421                            if (c >= 'a' && c <= 'z') {
   3422                                firstCharKey = c - 'a' + 26;
   3423                            } else if (c >= 'A' && c <= 'Z') {
   3424                                firstCharKey = c - 'A';
   3425                            } else {
   3426                                // No match
   3427                                if (c == ';') {
   3428                                    errNoNamedCharacterMatch();
   3429                                }
   3430                                emitOrAppendCharRefBuf(returnState);
   3431                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3432                                    cstart = pos;
   3433                                }
   3434                                reconsume = true;
   3435                                state = transition(state, returnState, reconsume, pos);
   3436                                continue stateloop;
   3437                            }
   3438                            // Didn't fail yet
   3439                            appendCharRefBuf(c);
   3440                            state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
   3441                            // `break` optimizes; `continue stateloop;` would be valid
   3442                            break;
   3443                    }
   3444                    // CPPONLY: MOZ_FALLTHROUGH;
   3445                case CHARACTER_REFERENCE_HILO_LOOKUP:
   3446                    {
   3447                        if (++pos == endPos) {
   3448                            break stateloop;
   3449                        }
   3450                        c = checkChar(buf, pos);
   3451                        /*
   3452                         * The data structure is as follows:
   3453                         *
   3454                         * HILO_ACCEL is a two-dimensional int array whose major
   3455                         * index corresponds to the second character of the
   3456                         * character reference (code point as index) and the
   3457                         * minor index corresponds to the first character of the
   3458                         * character reference (packed so that A-Z runs from 0
   3459                         * to 25 and a-z runs from 26 to 51). This layout makes
   3460                         * it easier to use the sparseness of the data structure
   3461                         * to omit parts of it: The second dimension of the
   3462                         * table is null when no character reference starts with
   3463                         * the character corresponding to that row.
   3464                         *
   3465                         * The int value in HILO_ACCEL (at these indices) is zero
   3466                         * if there exists no character reference starting with
   3467                         * that two-letter prefix. Otherwise, the value is an
   3468                         * int that packs two shorts so that the higher short is
   3469                         * the index of the highest character reference name
   3470                         * with that prefix in NAMES and the lower short
   3471                         * corresponds to the index of the lowest character
   3472                         * reference name with that prefix. (It happens that the
   3473                         * first two character reference names share their
   3474                         * prefix so the packed int cannot be 0 by packing the
   3475                         * two shorts.)
   3476                         *
   3477                         * NAMES is an array of byte arrays where each byte
   3478                         * array encodes the name of a character reference as
   3479                         * ASCII. The names omit the first two letters of the
   3480                         * name. (Since storing the first two letters would be
   3481                         * redundant with the data contained in HILO_ACCEL.) The
   3482                         * entries are lexically sorted.
   3483                         *
   3484                         * For a given index in NAMES, the same index in VALUES
   3485                         * contains the corresponding expansion as an array of
   3486                         * two UTF-16 code units (either the character and
   3487                         * U+0000 or a surrogate pair).
   3488                         */
   3489                        int hilo = 0;
   3490                        if (c <= 'z') {
   3491                            @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
   3492                            if (row != null) {
   3493                                hilo = row[firstCharKey];
   3494                            }
   3495                        }
   3496                        if (hilo == 0) {
   3497                            if (c == ';') {
   3498                                errNoNamedCharacterMatch();
   3499                            }
   3500                            emitOrAppendCharRefBuf(returnState);
   3501                            if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3502                                cstart = pos;
   3503                            }
   3504                            reconsume = true;
   3505                            state = transition(state, returnState, reconsume, pos);
   3506                            continue stateloop;
   3507                        }
   3508                        // Didn't fail yet
   3509                        appendCharRefBuf(c);
   3510                        lo = hilo & 0xFFFF;
   3511                        hi = hilo >> 16;
   3512                        entCol = -1;
   3513                        candidate = -1;
   3514                        charRefBufMark = 0;
   3515                        state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
   3516                        // fallthrough optimizes; `continue stateloop;` would also be valid
   3517                    }
   3518                    // CPPONLY: MOZ_FALLTHROUGH;
   3519                case CHARACTER_REFERENCE_TAIL:
   3520                    outer: for (;;) {
   3521                        if (++pos == endPos) {
   3522                            break stateloop;
   3523                        }
   3524                        c = checkChar(buf, pos);
   3525                        entCol++;
   3526                        /*
   3527                         * Consume the maximum number of characters possible,
   3528                         * with the consumed characters matching one of the
   3529                         * identifiers in the first column of the named
   3530                         * character references table (in a case-sensitive
   3531                         * manner).
   3532                         */
   3533                        loloop: for (;;) {
   3534                            if (hi < lo) {
   3535                                break outer;
   3536                            }
   3537                            if (entCol == NamedCharacters.NAMES[lo].length()) {
   3538                                candidate = lo;
   3539                                charRefBufMark = charRefBufLen;
   3540                                lo++;
   3541                            } else if (entCol > NamedCharacters.NAMES[lo].length()) {
   3542                                break outer;
   3543                            } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
   3544                                lo++;
   3545                            } else {
   3546                                break loloop;
   3547                            }
   3548                        }
   3549 
   3550                        hiloop: for (;;) {
   3551                            if (hi < lo) {
   3552                                break outer;
   3553                            }
   3554                            if (entCol == NamedCharacters.NAMES[hi].length()) {
   3555                                break hiloop;
   3556                            }
   3557                            if (entCol > NamedCharacters.NAMES[hi].length()) {
   3558                                break outer;
   3559                            } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
   3560                                hi--;
   3561                            } else {
   3562                                break hiloop;
   3563                            }
   3564                        }
   3565 
   3566                        if (c == ';') {
   3567                            // If we see a semicolon, there cannot be a
   3568                            // longer match. Break the loop. However, before
   3569                            // breaking, take the longest match so far as the
   3570                            // candidate, if we are just about to complete a
   3571                            // match.
   3572                            if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
   3573                                candidate = lo;
   3574                                charRefBufMark = charRefBufLen;
   3575                            }
   3576                            break outer;
   3577                        }
   3578 
   3579                        if (hi < lo) {
   3580                            break outer;
   3581                        }
   3582                        appendCharRefBuf(c);
   3583                        continue;
   3584                    }
   3585 
   3586                    if (candidate == -1) {
   3587                        // reconsume deals with CR, LF or nul
   3588                        if (c == ';') {
   3589                            errNoNamedCharacterMatch();
   3590                        }
   3591                        emitOrAppendCharRefBuf(returnState);
   3592                        if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3593                            cstart = pos;
   3594                        }
   3595                        reconsume = true;
   3596                        state = transition(state, returnState, reconsume, pos);
   3597                        continue stateloop;
   3598                    } else {
   3599                        // c can't be CR, LF or nul if we got here
   3600                        @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
   3601                        if (candidateName.length() == 0
   3602                                || candidateName.charAt(candidateName.length() - 1) != ';') {
   3603                            /*
   3604                             * If the last character matched is not a U+003B
   3605                             * SEMICOLON (;), there is a parse error.
   3606                             */
   3607                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   3608                                /*
   3609                                 * If the entity is being consumed as part of an
   3610                                 * attribute, and the last character matched is
   3611                                 * not a U+003B SEMICOLON (;),
   3612                                 */
   3613                                char ch;
   3614                                if (charRefBufMark == charRefBufLen) {
   3615                                    ch = c;
   3616                                } else {
   3617                                    ch = charRefBuf[charRefBufMark];
   3618                                }
   3619                                if (ch == '=' || (ch >= '0' && ch <= '9')
   3620                                        || (ch >= 'A' && ch <= 'Z')
   3621                                        || (ch >= 'a' && ch <= 'z')) {
   3622                                    /*
   3623                                     * and the next character is either a U+003D
   3624                                     * EQUALS SIGN character (=) or in the range
   3625                                     * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
   3626                                     * U+0041 LATIN CAPITAL LETTER A to U+005A
   3627                                     * LATIN CAPITAL LETTER Z, or U+0061 LATIN
   3628                                     * SMALL LETTER A to U+007A LATIN SMALL
   3629                                     * LETTER Z, then, for historical reasons,
   3630                                     * all the characters that were matched
   3631                                     * after the U+0026 AMPERSAND (&) must be
   3632                                     * unconsumed, and nothing is returned.
   3633                                     */
   3634                                    if (c == ';') {
   3635                                        errNoNamedCharacterMatch();
   3636                                    }
   3637                                    appendCharRefBufToStrBuf();
   3638                                    reconsume = true;
   3639                                    state = transition(state, returnState, reconsume, pos);
   3640                                    continue stateloop;
   3641                                }
   3642                            }
   3643                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   3644                                errUnescapedAmpersandInterpretedAsCharacterReference();
   3645                            } else {
   3646                                errNotSemicolonTerminated();
   3647                            }
   3648                        }
   3649 
   3650                        /*
   3651                         * Otherwise, return a character token for the character
   3652                         * corresponding to the entity name (as given by the
   3653                         * second column of the named character references
   3654                         * table).
   3655                         */
   3656                        // CPPONLY: completedNamedCharacterReference();
   3657                        @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
   3658                        if (
   3659                        // [NOCPP[
   3660                        val.length == 1
   3661                        // ]NOCPP]
   3662                        // CPPONLY: val[1] == 0
   3663                        ) {
   3664                            emitOrAppendOne(val, returnState);
   3665                        } else {
   3666                            emitOrAppendTwo(val, returnState);
   3667                        }
   3668                        // this is so complicated!
   3669                        if (charRefBufMark < charRefBufLen) {
   3670                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   3671                                appendStrBuf(charRefBuf, charRefBufMark,
   3672                                        charRefBufLen - charRefBufMark);
   3673                            } else {
   3674                                tokenHandler.characters(charRefBuf, charRefBufMark,
   3675                                        charRefBufLen - charRefBufMark);
   3676                            }
   3677                        }
   3678                        // charRefBufLen will be zeroed below!
   3679 
   3680                        // Check if we broke out early with c being the last
   3681                        // character that matched as opposed to being the
   3682                        // first one that didn't match. In the case of an
   3683                        // early break, the next run on text should start
   3684                        // *after* the current character and the current
   3685                        // character shouldn't be reconsumed.
   3686                        boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
   3687                        charRefBufLen = 0;
   3688                        if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3689                            cstart = earlyBreak ? pos + 1 : pos;
   3690                        }
   3691                        reconsume = !earlyBreak;
   3692                        state = transition(state, returnState, reconsume, pos);
   3693                        continue stateloop;
   3694                        /*
   3695                         * If the markup contains I'm &notit; I tell you, the
   3696                         * entity is parsed as "not", as in, I'm ¬it; I tell
   3697                         * you. But if the markup was I'm &notin; I tell you,
   3698                         * the entity would be parsed as "notin;", resulting in
   3699                         * I'm ∉ I tell you.
   3700                         */
   3701                    }
   3702                    // no fallthrough, reordering opportunity
   3703                case CONSUME_NCR:
   3704                    if (++pos == endPos) {
   3705                        break stateloop;
   3706                    }
   3707                    c = checkChar(buf, pos);
   3708                    value = 0;
   3709                    seenDigits = false;
   3710                    /*
   3711                     * The behavior further depends on the character after the
   3712                     * U+0023 NUMBER SIGN:
   3713                     */
   3714                    switch (c) {
   3715                        case 'x':
   3716                        case 'X':
   3717 
   3718                            /*
   3719                             * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
   3720                             * LETTER X Consume the X.
   3721                             *
   3722                             * Follow the steps below, but using the range of
   3723                             * characters U+0030 DIGIT ZERO through to U+0039
   3724                             * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
   3725                             * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
   3726                             * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
   3727                             * LETTER F (in other words, 0-9, A-F, a-f).
   3728                             *
   3729                             * When it comes to interpreting the number,
   3730                             * interpret it as a hexadecimal number.
   3731                             */
   3732                            appendCharRefBuf(c);
   3733                            state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
   3734                            continue stateloop;
   3735                        default:
   3736                            /*
   3737                             * Anything else Follow the steps below, but using
   3738                             * the range of characters U+0030 DIGIT ZERO through
   3739                             * to U+0039 DIGIT NINE (i.e. just 0-9).
   3740                             *
   3741                             * When it comes to interpreting the number,
   3742                             * interpret it as a decimal number.
   3743                             */
   3744                            reconsume = true;
   3745                            state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
   3746                            // `break` optimizes; `continue stateloop;` would be valid
   3747                            break;
   3748                    }
   3749                    // CPPONLY: MOZ_FALLTHROUGH;
   3750                case DECIMAL_NRC_LOOP:
   3751                    decimalloop: for (;;) {
   3752                        if (reconsume) {
   3753                            reconsume = false;
   3754                        } else {
   3755                            if (++pos == endPos) {
   3756                                break stateloop;
   3757                            }
   3758                            c = checkChar(buf, pos);
   3759                        }
   3760                        /*
   3761                         * Consume as many characters as match the range of
   3762                         * characters given above.
   3763                         */
   3764                        assert value >= 0: "value must not become negative.";
   3765                        if (c >= '0' && c <= '9') {
   3766                            seenDigits = true;
   3767                            // Avoid overflow
   3768                            if (value <= 0x10FFFF) {
   3769                                value *= 10;
   3770                                value += c - '0';
   3771                            }
   3772                            continue;
   3773                        } else if (c == ';') {
   3774                            if (seenDigits) {
   3775                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3776                                    cstart = pos + 1;
   3777                                }
   3778                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
   3779                                // `break` optimizes; `continue stateloop;` would be valid
   3780                                break decimalloop;
   3781                            } else {
   3782                                errNoDigitsInNCR();
   3783                                appendCharRefBuf(';');
   3784                                emitOrAppendCharRefBuf(returnState);
   3785                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3786                                    cstart = pos + 1;
   3787                                }
   3788                                state = transition(state, returnState, reconsume, pos);
   3789                                continue stateloop;
   3790                            }
   3791                        } else {
   3792                            /*
   3793                             * If no characters match the range, then don't
   3794                             * consume any characters (and unconsume the U+0023
   3795                             * NUMBER SIGN character and, if appropriate, the X
   3796                             * character). This is a parse error; nothing is
   3797                             * returned.
   3798                             *
   3799                             * Otherwise, if the next character is a U+003B
   3800                             * SEMICOLON, consume that too. If it isn't, there
   3801                             * is a parse error.
   3802                             */
   3803                            if (!seenDigits) {
   3804                                errNoDigitsInNCR();
   3805                                emitOrAppendCharRefBuf(returnState);
   3806                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3807                                    cstart = pos;
   3808                                }
   3809                                reconsume = true;
   3810                                state = transition(state, returnState, reconsume, pos);
   3811                                continue stateloop;
   3812                            } else {
   3813                                errCharRefLacksSemicolon();
   3814                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3815                                    cstart = pos;
   3816                                }
   3817                                reconsume = true;
   3818                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
   3819                                // `break` optimizes; `continue stateloop;` would be valid
   3820                                break decimalloop;
   3821                            }
   3822                        }
   3823                    }
   3824                    // CPPONLY: MOZ_FALLTHROUGH;
   3825                case HANDLE_NCR_VALUE:
   3826                    // WARNING previous state sets reconsume
   3827                    // We are not going to emit the contents of charRefBuf.
   3828                    charRefBufLen = 0;
   3829                    // XXX inline this case if the method size can take it
   3830                    handleNcrValue(returnState);
   3831                    state = transition(state, returnState, reconsume, pos);
   3832                    continue stateloop;
   3833                    // no fallthrough, reordering opportunity
   3834                case HEX_NCR_LOOP:
   3835                    for (;;) {
   3836                        if (++pos == endPos) {
   3837                            break stateloop;
   3838                        }
   3839                        c = checkChar(buf, pos);
   3840                        /*
   3841                         * Consume as many characters as match the range of
   3842                         * characters given above.
   3843                         */
   3844                        assert value >= 0: "value must not become negative.";
   3845                        if (c >= '0' && c <= '9') {
   3846                            seenDigits = true;
   3847                            // Avoid overflow
   3848                            if (value <= 0x10FFFF) {
   3849                                value *= 16;
   3850                                value += c - '0';
   3851                            }
   3852                            continue;
   3853                        } else if (c >= 'A' && c <= 'F') {
   3854                            seenDigits = true;
   3855                            // Avoid overflow
   3856                            if (value <= 0x10FFFF) {
   3857                                value *= 16;
   3858                                value += c - 'A' + 10;
   3859                            }
   3860                            continue;
   3861                        } else if (c >= 'a' && c <= 'f') {
   3862                            seenDigits = true;
   3863                            // Avoid overflow
   3864                            if (value <= 0x10FFFF) {
   3865                                value *= 16;
   3866                                value += c - 'a' + 10;
   3867                            }
   3868                            continue;
   3869                        } else if (c == ';') {
   3870                            if (seenDigits) {
   3871                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3872                                    cstart = pos + 1;
   3873                                }
   3874                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
   3875                                continue stateloop;
   3876                            } else {
   3877                                errNoDigitsInNCR();
   3878                                appendCharRefBuf(';');
   3879                                emitOrAppendCharRefBuf(returnState);
   3880                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3881                                    cstart = pos + 1;
   3882                                }
   3883                                state = transition(state, returnState, reconsume, pos);
   3884                                continue stateloop;
   3885                            }
   3886                        } else {
   3887                            /*
   3888                             * If no characters match the range, then don't
   3889                             * consume any characters (and unconsume the U+0023
   3890                             * NUMBER SIGN character and, if appropriate, the X
   3891                             * character). This is a parse error; nothing is
   3892                             * returned.
   3893                             *
   3894                             * Otherwise, if the next character is a U+003B
   3895                             * SEMICOLON, consume that too. If it isn't, there
   3896                             * is a parse error.
   3897                             */
   3898                            if (!seenDigits) {
   3899                                errNoDigitsInNCR();
   3900                                emitOrAppendCharRefBuf(returnState);
   3901                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3902                                    cstart = pos;
   3903                                }
   3904                                reconsume = true;
   3905                                state = transition(state, returnState, reconsume, pos);
   3906                                continue stateloop;
   3907                            } else {
   3908                                errCharRefLacksSemicolon();
   3909                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
   3910                                    cstart = pos;
   3911                                }
   3912                                reconsume = true;
   3913                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
   3914                                continue stateloop;
   3915                            }
   3916                        }
   3917                    }
   3918                    // no fallthrough, reordering opportunity
   3919                case PLAINTEXT:
   3920                    plaintextloop: for (;;) {
   3921                        if (reconsume) {
   3922                            reconsume = false;
   3923                        } else {
   3924                            ++pos;
   3925                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   3926                            // The line below advances pos by some number of code units that this state is indifferent to.
   3927                            // CPPONLY: pos += accelerateAdvancementPlaintext(buf, pos, endPos);
   3928                            if (pos == endPos) {
   3929                                break stateloop;
   3930                            }
   3931                            c = checkChar(buf, pos);
   3932                        }
   3933                        switch (c) {
   3934                            case '\u0000':
   3935                                emitPlaintextReplacementCharacter(buf, pos);
   3936                                continue;
   3937                            case '\r':
   3938                                emitCarriageReturn(buf, pos);
   3939                                break stateloop;
   3940                            case '\n':
   3941                                silentLineFeed();
   3942                                // CPPONLY: MOZ_FALLTHROUGH;
   3943                            default:
   3944                                /*
   3945                                 * Anything else Emit the current input
   3946                                 * character as a character token. Stay in the
   3947                                 * PLAINTEXT state.
   3948                                 */
   3949                                continue;
   3950                        }
   3951                    }
   3952                    // no fallthrough, reordering opportunity
   3953                case CLOSE_TAG_OPEN:
   3954                    if (++pos == endPos) {
   3955                        break stateloop;
   3956                    }
   3957                    c = checkChar(buf, pos);
   3958                    /*
   3959                     * Otherwise, if the content model flag is set to the PCDATA
   3960                     * state, or if the next few characters do match that tag
   3961                     * name, consume the next input character:
   3962                     */
   3963                    switch (c) {
   3964                        case '>':
   3965                            /* U+003E GREATER-THAN SIGN (>) Parse error. */
   3966                            errLtSlashGt();
   3967                            /*
   3968                             * Switch to the data state.
   3969                             */
   3970                            cstart = pos + 1;
   3971                            state = transition(state, Tokenizer.DATA, reconsume, pos);
   3972                            continue stateloop;
   3973                        case '\r':
   3974                            silentCarriageReturn();
   3975                            /* Anything else Parse error. */
   3976                            errGarbageAfterLtSlash();
   3977                            /*
   3978                             * Switch to the bogus comment state.
   3979                             */
   3980                            clearStrBufBeforeUse();
   3981                            appendStrBuf('\n');
   3982                            state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   3983                            break stateloop;
   3984                        case '\n':
   3985                            silentLineFeed();
   3986                            /* Anything else Parse error. */
   3987                            errGarbageAfterLtSlash();
   3988                            /*
   3989                             * Switch to the bogus comment state.
   3990                             */
   3991                            clearStrBufBeforeUse();
   3992                            appendStrBuf(c);
   3993                            state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   3994                            continue stateloop;
   3995                        case '\u0000':
   3996                            c = '\uFFFD';
   3997                            // CPPONLY: MOZ_FALLTHROUGH;
   3998                        default:
   3999                            if (c >= 'A' && c <= 'Z') {
   4000                                c += 0x20;
   4001                            }
   4002                            if (c >= 'a' && c <= 'z') {
   4003                                /*
   4004                                 * U+0061 LATIN SMALL LETTER A through to U+007A
   4005                                 * LATIN SMALL LETTER Z Create a new end tag
   4006                                 * token,
   4007                                 */
   4008                                endTag = true;
   4009                                /*
   4010                                 * set its tag name to the input character,
   4011                                 */
   4012                                clearStrBufBeforeUse();
   4013                                appendStrBuf(c);
   4014                                containsHyphen = false;
   4015                                /*
   4016                                 * then switch to the tag name state. (Don't
   4017                                 * emit the token yet; further details will be
   4018                                 * filled in before it is emitted.)
   4019                                 */
   4020                                state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
   4021                                continue stateloop;
   4022                            } else {
   4023                                /* Anything else Parse error. */
   4024                                errGarbageAfterLtSlash();
   4025                                /*
   4026                                 * Switch to the bogus comment state.
   4027                                 */
   4028                                clearStrBufBeforeUse();
   4029                                appendStrBuf(c);
   4030                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   4031                                continue stateloop;
   4032                            }
   4033                    }
   4034                    // no fallthrough, reordering opportunity
   4035                case RCDATA:
   4036                    rcdataloop: for (;;) {
   4037                        if (reconsume) {
   4038                            reconsume = false;
   4039                        } else {
   4040                            ++pos;
   4041                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   4042                            // The line below advances pos by some number of code units that this state is indifferent to.
   4043                            // RCDATA and DATA have the same set of characters that they are indifferent to, hence accelerateData.
   4044                            // CPPONLY: pos += accelerateAdvancementData(buf, pos, endPos);
   4045                            if (pos == endPos) {
   4046                                break stateloop;
   4047                            }
   4048                            c = checkChar(buf, pos);
   4049                        }
   4050                        switch (c) {
   4051                            case '&':
   4052                                /*
   4053                                 * U+0026 AMPERSAND (&) Switch to the character
   4054                                 * reference in RCDATA state.
   4055                                 */
   4056                                flushChars(buf, pos);
   4057                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
   4058                                appendCharRefBuf(c);
   4059                                setAdditionalAndRememberAmpersandLocation('\u0000');
   4060                                returnState = state;
   4061                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
   4062                                continue stateloop;
   4063                            case '<':
   4064                                /*
   4065                                 * U+003C LESS-THAN SIGN (<) Switch to the
   4066                                 * RCDATA less-than sign state.
   4067                                 */
   4068                                flushChars(buf, pos);
   4069 
   4070                                returnState = state;
   4071                                state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
   4072                                continue stateloop;
   4073                            case '\u0000':
   4074                                emitReplacementCharacter(buf, pos);
   4075                                continue;
   4076                            case '\r':
   4077                                emitCarriageReturn(buf, pos);
   4078                                break stateloop;
   4079                            case '\n':
   4080                                silentLineFeed();
   4081                                // CPPONLY: MOZ_FALLTHROUGH;
   4082                            default:
   4083                                /*
   4084                                 * Emit the current input character as a
   4085                                 * character token. Stay in the RCDATA state.
   4086                                 */
   4087                                continue;
   4088                        }
   4089                    }
   4090                    // no fallthrough, reordering opportunity
   4091                case RAWTEXT:
   4092                    rawtextloop: for (;;) {
   4093                        if (reconsume) {
   4094                            reconsume = false;
   4095                        } else {
   4096                            ++pos;
   4097                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   4098                            // The line below advances pos by some number of code units that this state is indifferent to.
   4099                            // CPPONLY: pos += accelerateAdvancementRawtext(buf, pos, endPos);
   4100                            if (pos == endPos) {
   4101                                break stateloop;
   4102                            }
   4103                            c = checkChar(buf, pos);
   4104                        }
   4105                        switch (c) {
   4106                            case '<':
   4107                                /*
   4108                                 * U+003C LESS-THAN SIGN (<) Switch to the
   4109                                 * RAWTEXT less-than sign state.
   4110                                 */
   4111                                flushChars(buf, pos);
   4112 
   4113                                returnState = state;
   4114                                state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
   4115                                // `break` optimizes; `continue stateloop;` would be valid
   4116                                break rawtextloop;
   4117                            case '\u0000':
   4118                                emitReplacementCharacter(buf, pos);
   4119                                continue;
   4120                            case '\r':
   4121                                emitCarriageReturn(buf, pos);
   4122                                break stateloop;
   4123                            case '\n':
   4124                                silentLineFeed();
   4125                                // CPPONLY: MOZ_FALLTHROUGH;
   4126                            default:
   4127                                /*
   4128                                 * Emit the current input character as a
   4129                                 * character token. Stay in the RAWTEXT state.
   4130                                 */
   4131                                continue;
   4132                        }
   4133                    }
   4134                    // CPPONLY: MOZ_FALLTHROUGH;
   4135                case RAWTEXT_RCDATA_LESS_THAN_SIGN:
   4136                    rawtextrcdatalessthansignloop: for (;;) {
   4137                        if (++pos == endPos) {
   4138                            break stateloop;
   4139                        }
   4140                        c = checkChar(buf, pos);
   4141                        switch (c) {
   4142                            case '/':
   4143                                /*
   4144                                 * U+002F SOLIDUS (/) Set the temporary buffer
   4145                                 * to the empty string. Switch to the script
   4146                                 * data end tag open state.
   4147                                 */
   4148                                index = 0;
   4149                                clearStrBufBeforeUse();
   4150                                state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
   4151                                // `break` optimizes; `continue stateloop;` would be valid
   4152                                break rawtextrcdatalessthansignloop;
   4153                            default:
   4154                                /*
   4155                                 * Otherwise, emit a U+003C LESS-THAN SIGN
   4156                                 * character token
   4157                                 */
   4158                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   4159                                /*
   4160                                 * and reconsume the current input character in
   4161                                 * the state saved in returnState.
   4162                                 */
   4163                                cstart = pos;
   4164                                reconsume = true;
   4165                                state = transition(state, returnState, reconsume, pos);
   4166                                continue stateloop;
   4167                        }
   4168                    }
   4169                    // CPPONLY: MOZ_FALLTHROUGH;
   4170                case NON_DATA_END_TAG_NAME:
   4171                    for (;;) {
   4172                        if (++pos == endPos) {
   4173                            break stateloop;
   4174                        }
   4175                        c = checkChar(buf, pos);
   4176                        /*
   4177                         * ASSERT! when entering this state, set index to 0 and
   4178                         * call clearStrBufBeforeUse(); Let's implement the above
   4179                         * without lookahead. strBuf is the 'temporary buffer'.
   4180                         */
   4181                        if (endTagExpectationAsArray == null) {
   4182                            tokenHandler.characters(Tokenizer.LT_SOLIDUS,
   4183                                    0, 2);
   4184                            cstart = pos;
   4185                            reconsume = true;
   4186                            state = transition(state, returnState, reconsume, pos);
   4187                            continue stateloop;
   4188                        } else if (index < endTagExpectationAsArray.length) {
   4189                            char e = endTagExpectationAsArray[index];
   4190                            char folded = c;
   4191                            if (c >= 'A' && c <= 'Z') {
   4192                                folded += 0x20;
   4193                            }
   4194                            if (folded != e) {
   4195                                // [NOCPP[
   4196                                errHtml4LtSlashInRcdata(folded);
   4197                                // ]NOCPP]
   4198                                tokenHandler.characters(Tokenizer.LT_SOLIDUS,
   4199                                        0, 2);
   4200                                emitStrBuf();
   4201                                cstart = pos;
   4202                                reconsume = true;
   4203                                state = transition(state, returnState, reconsume, pos);
   4204                                continue stateloop;
   4205                            }
   4206                            appendStrBuf(c);
   4207                            index++;
   4208                            continue;
   4209                        } else {
   4210                            endTag = true;
   4211                            // XXX replace contentModelElement with different
   4212                            // type
   4213                            tagName = endTagExpectation;
   4214                            switch (c) {
   4215                                case '\r':
   4216                                    silentCarriageReturn();
   4217                                    clearStrBufAfterUse(); // strBuf not used
   4218                                    state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   4219                                    break stateloop;
   4220                                case '\n':
   4221                                    silentLineFeed();
   4222                                    // CPPONLY: MOZ_FALLTHROUGH;
   4223                                case ' ':
   4224                                case '\t':
   4225                                case '\u000C':
   4226                                    /*
   4227                                     * U+0009 CHARACTER TABULATION U+000A LINE
   4228                                     * FEED (LF) U+000C FORM FEED (FF) U+0020
   4229                                     * SPACE If the current end tag token is an
   4230                                     * appropriate end tag token, then switch to
   4231                                     * the before attribute name state.
   4232                                     */
   4233                                    clearStrBufAfterUse(); // strBuf not used
   4234                                    state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
   4235                                    continue stateloop;
   4236                                case '/':
   4237                                    /*
   4238                                     * U+002F SOLIDUS (/) If the current end tag
   4239                                     * token is an appropriate end tag token,
   4240                                     * then switch to the self-closing start tag
   4241                                     * state.
   4242                                     */
   4243                                    clearStrBufAfterUse(); // strBuf not used
   4244                                    state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
   4245                                    continue stateloop;
   4246                                case '>':
   4247                                    /*
   4248                                     * U+003E GREATER-THAN SIGN (>) If the
   4249                                     * current end tag token is an appropriate
   4250                                     * end tag token, then emit the current tag
   4251                                     * token and switch to the data state.
   4252                                     */
   4253                                    clearStrBufAfterUse(); // strBuf not used
   4254                                    state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
   4255                                    if (shouldSuspend) {
   4256                                        break stateloop;
   4257                                    }
   4258                                    continue stateloop;
   4259                                default:
   4260                                    /*
   4261                                     * Emit a U+003C LESS-THAN SIGN character
   4262                                     * token, a U+002F SOLIDUS character token,
   4263                                     * a character token for each of the
   4264                                     * characters in the temporary buffer (in
   4265                                     * the order they were added to the buffer),
   4266                                     * and reconsume the current input character
   4267                                     * in the state saved in returnState.
   4268                                     */
   4269                                    // [NOCPP[
   4270                                    errWarnLtSlashInRcdata();
   4271                                    // ]NOCPP]
   4272                                    tokenHandler.characters(
   4273                                            Tokenizer.LT_SOLIDUS, 0, 2);
   4274                                    emitStrBuf();
   4275                                    cstart = pos; // don't drop the
   4276                                                  // character
   4277                                    reconsume = true;
   4278                                    state = transition(state, returnState, reconsume, pos);
   4279                                    continue stateloop;
   4280                            }
   4281                        }
   4282                    }
   4283                    // no fallthrough, reordering opportunity
   4284                    // BEGIN HOTSPOT WORKAROUND
   4285                case BOGUS_COMMENT:
   4286                    boguscommentloop: for (;;) {
   4287                        if (reconsume) {
   4288                            reconsume = false;
   4289                        } else {
   4290                            if (++pos == endPos) {
   4291                                break stateloop;
   4292                            }
   4293                            c = checkChar(buf, pos);
   4294                        }
   4295                        /*
   4296                         * Consume every character up to and including the first
   4297                         * U+003E GREATER-THAN SIGN character (>) or the end of
   4298                         * the file (EOF), whichever comes first. Emit a comment
   4299                         * token whose data is the concatenation of all the
   4300                         * characters starting from and including the character
   4301                         * that caused the state machine to switch into the
   4302                         * bogus comment state, up to and including the
   4303                         * character immediately before the last consumed
   4304                         * character (i.e. up to the character just before the
   4305                         * U+003E or EOF character). (If the comment was started
   4306                         * by the end of the file (EOF), the token is empty.)
   4307                         *
   4308                         * Switch to the data state.
   4309                         *
   4310                         * If the end of the file was reached, reconsume the EOF
   4311                         * character.
   4312                         */
   4313                        switch (c) {
   4314                            case '>':
   4315                                emitComment(0, pos);
   4316                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   4317                                if (shouldSuspend) {
   4318                                    break stateloop;
   4319                                }
   4320                                continue stateloop;
   4321                            case '-':
   4322                                appendStrBuf(c);
   4323                                state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
   4324                                // `break` optimizes; `continue stateloop;` would be valid
   4325                                break boguscommentloop;
   4326                            case '\r':
   4327                                appendStrBufCarriageReturn();
   4328                                break stateloop;
   4329                            case '\n':
   4330                                appendStrBufLineFeed();
   4331                                continue;
   4332                            case '\u0000':
   4333                                c = '\uFFFD';
   4334                                // CPPONLY: MOZ_FALLTHROUGH;
   4335                            default:
   4336                                appendStrBuf(c);
   4337                                continue;
   4338                        }
   4339                    }
   4340                    // CPPONLY: MOZ_FALLTHROUGH;
   4341                case BOGUS_COMMENT_HYPHEN:
   4342                    boguscommenthyphenloop: for (;;) {
   4343                        if (++pos == endPos) {
   4344                            break stateloop;
   4345                        }
   4346                        c = checkChar(buf, pos);
   4347                        switch (c) {
   4348                            case '>':
   4349                                // [NOCPP[
   4350                                maybeAppendSpaceToBogusComment();
   4351                                // ]NOCPP]
   4352                                emitComment(0, pos);
   4353                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   4354                                if (shouldSuspend) {
   4355                                    break stateloop;
   4356                                }
   4357                                continue stateloop;
   4358                            case '-':
   4359                                appendSecondHyphenToBogusComment();
   4360                                continue boguscommenthyphenloop;
   4361                            case '\r':
   4362                                appendStrBufCarriageReturn();
   4363                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   4364                                break stateloop;
   4365                            case '\n':
   4366                                appendStrBufLineFeed();
   4367                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   4368                                continue stateloop;
   4369                            case '\u0000':
   4370                                c = '\uFFFD';
   4371                                // CPPONLY: MOZ_FALLTHROUGH;
   4372                            default:
   4373                                appendStrBuf(c);
   4374                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   4375                                continue stateloop;
   4376                        }
   4377                    }
   4378                    // no fallthrough, reordering opportunity
   4379                case SCRIPT_DATA:
   4380                    scriptdataloop: for (;;) {
   4381                        if (reconsume) {
   4382                            reconsume = false;
   4383                        } else {
   4384                            ++pos;
   4385                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   4386                            // The line below advances pos by some number of code units that this state is indifferent to.
   4387                            // Using `accelerateAdvancementRawtext`, because this state has the same characters of interest as RAWTEXT.
   4388                            // CPPONLY: pos += accelerateAdvancementRawtext(buf, pos, endPos);
   4389                            if (pos == endPos) {
   4390                                break stateloop;
   4391                            }
   4392                            c = checkChar(buf, pos);
   4393                        }
   4394                        switch (c) {
   4395                            case '<':
   4396                                /*
   4397                                 * U+003C LESS-THAN SIGN (<) Switch to the
   4398                                 * script data less-than sign state.
   4399                                 */
   4400                                flushChars(buf, pos);
   4401                                returnState = state;
   4402                                state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
   4403                                // `break` optimizes; `continue stateloop;` would be valid
   4404                                break scriptdataloop;
   4405                            case '\u0000':
   4406                                emitReplacementCharacter(buf, pos);
   4407                                continue;
   4408                            case '\r':
   4409                                emitCarriageReturn(buf, pos);
   4410                                break stateloop;
   4411                            case '\n':
   4412                                silentLineFeed();
   4413                                // CPPONLY: MOZ_FALLTHROUGH;
   4414                            default:
   4415                                /*
   4416                                 * Anything else Emit the current input
   4417                                 * character as a character token. Stay in the
   4418                                 * script data state.
   4419                                 */
   4420                                continue;
   4421                        }
   4422                    }
   4423                    // CPPONLY: MOZ_FALLTHROUGH;
   4424                case SCRIPT_DATA_LESS_THAN_SIGN:
   4425                    scriptdatalessthansignloop: for (;;) {
   4426                        if (++pos == endPos) {
   4427                            break stateloop;
   4428                        }
   4429                        c = checkChar(buf, pos);
   4430                        switch (c) {
   4431                            case '/':
   4432                                /*
   4433                                 * U+002F SOLIDUS (/) Set the temporary buffer
   4434                                 * to the empty string. Switch to the script
   4435                                 * data end tag open state.
   4436                                 */
   4437                                index = 0;
   4438                                clearStrBufBeforeUse();
   4439                                state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
   4440                                continue stateloop;
   4441                            case '!':
   4442                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   4443                                cstart = pos;
   4444                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
   4445                                // `break` optimizes; `continue stateloop;` would be valid
   4446                                break scriptdatalessthansignloop;
   4447                            default:
   4448                                /*
   4449                                 * Otherwise, emit a U+003C LESS-THAN SIGN
   4450                                 * character token
   4451                                 */
   4452                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   4453                                /*
   4454                                 * and reconsume the current input character in
   4455                                 * the script data state.
   4456                                 */
   4457                                cstart = pos;
   4458                                reconsume = true;
   4459                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
   4460                                continue stateloop;
   4461                        }
   4462                    }
   4463                    // CPPONLY: MOZ_FALLTHROUGH;
   4464                case SCRIPT_DATA_ESCAPE_START:
   4465                    scriptdataescapestartloop: for (;;) {
   4466                        if (++pos == endPos) {
   4467                            break stateloop;
   4468                        }
   4469                        c = checkChar(buf, pos);
   4470                        /*
   4471                         * Consume the next input character:
   4472                         */
   4473                        switch (c) {
   4474                            case '-':
   4475                                /*
   4476                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4477                                 * HYPHEN-MINUS character token. Switch to the
   4478                                 * script data escape start dash state.
   4479                                 */
   4480                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
   4481                                // `break` optimizes; `continue stateloop;` would be valid
   4482                                break scriptdataescapestartloop;
   4483                            default:
   4484                                /*
   4485                                 * Anything else Reconsume the current input
   4486                                 * character in the script data state.
   4487                                 */
   4488                                reconsume = true;
   4489                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
   4490                                continue stateloop;
   4491                        }
   4492                    }
   4493                    // CPPONLY: MOZ_FALLTHROUGH;
   4494                case SCRIPT_DATA_ESCAPE_START_DASH:
   4495                    scriptdataescapestartdashloop: for (;;) {
   4496                        if (++pos == endPos) {
   4497                            break stateloop;
   4498                        }
   4499                        c = checkChar(buf, pos);
   4500                        /*
   4501                         * Consume the next input character:
   4502                         */
   4503                        switch (c) {
   4504                            case '-':
   4505                                /*
   4506                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4507                                 * HYPHEN-MINUS character token. Switch to the
   4508                                 * script data escaped dash dash state.
   4509                                 */
   4510                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
   4511                                // `break` optimizes; `continue stateloop;` would be valid
   4512                                break scriptdataescapestartdashloop;
   4513                            default:
   4514                                /*
   4515                                 * Anything else Reconsume the current input
   4516                                 * character in the script data state.
   4517                                 */
   4518                                reconsume = true;
   4519                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
   4520                                continue stateloop;
   4521                        }
   4522                    }
   4523                    // CPPONLY: MOZ_FALLTHROUGH;
   4524                case SCRIPT_DATA_ESCAPED_DASH_DASH:
   4525                    scriptdataescapeddashdashloop: for (;;) {
   4526                        if (++pos == endPos) {
   4527                            break stateloop;
   4528                        }
   4529                        c = checkChar(buf, pos);
   4530                        /*
   4531                         * Consume the next input character:
   4532                         */
   4533                        switch (c) {
   4534                            case '-':
   4535                                /*
   4536                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4537                                 * HYPHEN-MINUS character token. Stay in the
   4538                                 * script data escaped dash dash state.
   4539                                 */
   4540                                continue;
   4541                            case '<':
   4542                                /*
   4543                                 * U+003C LESS-THAN SIGN (<) Switch to the
   4544                                 * script data escaped less-than sign state.
   4545                                 */
   4546                                flushChars(buf, pos);
   4547                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
   4548                                continue stateloop;
   4549                            case '>':
   4550                                /*
   4551                                 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
   4552                                 * GREATER-THAN SIGN character token. Switch to
   4553                                 * the script data state.
   4554                                 */
   4555                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
   4556                                continue stateloop;
   4557                            case '\u0000':
   4558                                emitReplacementCharacter(buf, pos);
   4559                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4560                                break scriptdataescapeddashdashloop;
   4561                            case '\r':
   4562                                emitCarriageReturn(buf, pos);
   4563                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4564                                break stateloop;
   4565                            case '\n':
   4566                                silentLineFeed();
   4567                                // CPPONLY: MOZ_FALLTHROUGH;
   4568                            default:
   4569                                /*
   4570                                 * Anything else Emit the current input
   4571                                 * character as a character token. Switch to the
   4572                                 * script data escaped state.
   4573                                 */
   4574                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4575                                // `break` optimizes; `continue stateloop;` would be valid
   4576                                break scriptdataescapeddashdashloop;
   4577                        }
   4578                    }
   4579                    // CPPONLY: MOZ_FALLTHROUGH;
   4580                case SCRIPT_DATA_ESCAPED:
   4581                    scriptdataescapedloop: for (;;) {
   4582                        if (reconsume) {
   4583                            reconsume = false;
   4584                        } else {
   4585                            ++pos;
   4586                            // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
   4587                            // The line below advances pos by some number of code units that this state is indifferent to.
   4588                            // CPPONLY: pos += accelerateAdvancementScriptDataEscaped(buf, pos, endPos);
   4589                            if (pos == endPos) {
   4590                                break stateloop;
   4591                            }
   4592                            c = checkChar(buf, pos);
   4593                        }
   4594                        /*
   4595                         * Consume the next input character:
   4596                         */
   4597                        switch (c) {
   4598                            case '-':
   4599                                /*
   4600                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4601                                 * HYPHEN-MINUS character token. Switch to the
   4602                                 * script data escaped dash state.
   4603                                 */
   4604                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
   4605                                // `break` optimizes; `continue stateloop;` would be valid
   4606                                break scriptdataescapedloop;
   4607                            case '<':
   4608                                /*
   4609                                 * U+003C LESS-THAN SIGN (<) Switch to the
   4610                                 * script data escaped less-than sign state.
   4611                                 */
   4612                                flushChars(buf, pos);
   4613                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
   4614                                continue stateloop;
   4615                            case '\u0000':
   4616                                emitReplacementCharacter(buf, pos);
   4617                                continue;
   4618                            case '\r':
   4619                                emitCarriageReturn(buf, pos);
   4620                                break stateloop;
   4621                            case '\n':
   4622                                silentLineFeed();
   4623                                // CPPONLY: MOZ_FALLTHROUGH;
   4624                            default:
   4625                                /*
   4626                                 * Anything else Emit the current input
   4627                                 * character as a character token. Stay in the
   4628                                 * script data escaped state.
   4629                                 */
   4630                                continue;
   4631                        }
   4632                    }
   4633                    // CPPONLY: MOZ_FALLTHROUGH;
   4634                case SCRIPT_DATA_ESCAPED_DASH:
   4635                    scriptdataescapeddashloop: for (;;) {
   4636                        if (++pos == endPos) {
   4637                            break stateloop;
   4638                        }
   4639                        c = checkChar(buf, pos);
   4640                        /*
   4641                         * Consume the next input character:
   4642                         */
   4643                        switch (c) {
   4644                            case '-':
   4645                                /*
   4646                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4647                                 * HYPHEN-MINUS character token. Switch to the
   4648                                 * script data escaped dash dash state.
   4649                                 */
   4650                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
   4651                                continue stateloop;
   4652                            case '<':
   4653                                /*
   4654                                 * U+003C LESS-THAN SIGN (<) Switch to the
   4655                                 * script data escaped less-than sign state.
   4656                                 */
   4657                                flushChars(buf, pos);
   4658                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
   4659                                // `break` optimizes; `continue stateloop;` would be valid
   4660                                break scriptdataescapeddashloop;
   4661                            case '\u0000':
   4662                                emitReplacementCharacter(buf, pos);
   4663                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4664                                continue stateloop;
   4665                            case '\r':
   4666                                emitCarriageReturn(buf, pos);
   4667                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4668                                break stateloop;
   4669                            case '\n':
   4670                                silentLineFeed();
   4671                                // CPPONLY: MOZ_FALLTHROUGH;
   4672                            default:
   4673                                /*
   4674                                 * Anything else Emit the current input
   4675                                 * character as a character token. Switch to the
   4676                                 * script data escaped state.
   4677                                 */
   4678                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4679                                continue stateloop;
   4680                        }
   4681                    }
   4682                    // CPPONLY: MOZ_FALLTHROUGH;
   4683                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
   4684                    scriptdataescapedlessthanloop: for (;;) {
   4685                        if (++pos == endPos) {
   4686                            break stateloop;
   4687                        }
   4688                        c = checkChar(buf, pos);
   4689                        /*
   4690                         * Consume the next input character:
   4691                         */
   4692                        switch (c) {
   4693                            case '/':
   4694                                /*
   4695                                 * U+002F SOLIDUS (/) Set the temporary buffer
   4696                                 * to the empty string. Switch to the script
   4697                                 * data escaped end tag open state.
   4698                                 */
   4699                                index = 0;
   4700                                clearStrBufBeforeUse();
   4701                                returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
   4702                                state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
   4703                                continue stateloop;
   4704                            case 'S':
   4705                            case 's':
   4706                                /*
   4707                                 * U+0041 LATIN CAPITAL LETTER A through to
   4708                                 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
   4709                                 * LESS-THAN SIGN character token and the
   4710                                 * current input character as a character token.
   4711                                 */
   4712                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   4713                                cstart = pos;
   4714                                index = 1;
   4715                                /*
   4716                                 * Set the temporary buffer to the empty string.
   4717                                 * Append the lowercase version of the current
   4718                                 * input character (add 0x0020 to the
   4719                                 * character's code point) to the temporary
   4720                                 * buffer. Switch to the script data double
   4721                                 * escape start state.
   4722                                 */
   4723                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
   4724                                // `break` optimizes; `continue stateloop;` would be valid
   4725                                break scriptdataescapedlessthanloop;
   4726                            default:
   4727                                /*
   4728                                 * Anything else Emit a U+003C LESS-THAN SIGN
   4729                                 * character token and reconsume the current
   4730                                 * input character in the script data escaped
   4731                                 * state.
   4732                                 */
   4733                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   4734                                cstart = pos;
   4735                                reconsume = true;
   4736                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4737                                continue stateloop;
   4738                        }
   4739                    }
   4740                    // CPPONLY: MOZ_FALLTHROUGH;
   4741                case SCRIPT_DATA_DOUBLE_ESCAPE_START:
   4742                    scriptdatadoubleescapestartloop: for (;;) {
   4743                        if (++pos == endPos) {
   4744                            break stateloop;
   4745                        }
   4746                        c = checkChar(buf, pos);
   4747                        assert index > 0;
   4748                        if (index < 6) { // SCRIPT_ARR.length
   4749                            char folded = c;
   4750                            if (c >= 'A' && c <= 'Z') {
   4751                                folded += 0x20;
   4752                            }
   4753                            if (folded != Tokenizer.SCRIPT_ARR[index]) {
   4754                                reconsume = true;
   4755                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4756                                continue stateloop;
   4757                            }
   4758                            index++;
   4759                            continue;
   4760                        }
   4761                        switch (c) {
   4762                            case '\r':
   4763                                emitCarriageReturn(buf, pos);
   4764                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4765                                break stateloop;
   4766                            case '\n':
   4767                                silentLineFeed();
   4768                                // CPPONLY: MOZ_FALLTHROUGH;
   4769                            case ' ':
   4770                            case '\t':
   4771                            case '\u000C':
   4772                            case '/':
   4773                            case '>':
   4774                                /*
   4775                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   4776                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   4777                                 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
   4778                                 * (>) Emit the current input character as a
   4779                                 * character token. If the temporary buffer is
   4780                                 * the string "script", then switch to the
   4781                                 * script data double escaped state.
   4782                                 */
   4783                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4784                                // `break` optimizes; `continue stateloop;` would be valid
   4785                                break scriptdatadoubleescapestartloop;
   4786                            default:
   4787                                /*
   4788                                 * Anything else Reconsume the current input
   4789                                 * character in the script data escaped state.
   4790                                 */
   4791                                reconsume = true;
   4792                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   4793                                continue stateloop;
   4794                        }
   4795                    }
   4796                    // CPPONLY: MOZ_FALLTHROUGH;
   4797                case SCRIPT_DATA_DOUBLE_ESCAPED:
   4798                    scriptdatadoubleescapedloop: for (;;) {
   4799                        if (reconsume) {
   4800                            reconsume = false;
   4801                        } else {
   4802                            if (++pos == endPos) {
   4803                                break stateloop;
   4804                            }
   4805                            c = checkChar(buf, pos);
   4806                        }
   4807                        /*
   4808                         * Consume the next input character:
   4809                         */
   4810                        switch (c) {
   4811                            case '-':
   4812                                /*
   4813                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4814                                 * HYPHEN-MINUS character token. Switch to the
   4815                                 * script data double escaped dash state.
   4816                                 */
   4817                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
   4818                                // `break` optimizes; `continue stateloop;` would be valid
   4819                                break scriptdatadoubleescapedloop;
   4820                            case '<':
   4821                                /*
   4822                                 * U+003C LESS-THAN SIGN (<) Emit a U+003C
   4823                                 * LESS-THAN SIGN character token. Switch to the
   4824                                 * script data double escaped less-than sign
   4825                                 * state.
   4826                                 */
   4827                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
   4828                                continue stateloop;
   4829                            case '\u0000':
   4830                                emitReplacementCharacter(buf, pos);
   4831                                continue;
   4832                            case '\r':
   4833                                emitCarriageReturn(buf, pos);
   4834                                break stateloop;
   4835                            case '\n':
   4836                                silentLineFeed();
   4837                                // CPPONLY: MOZ_FALLTHROUGH;
   4838                            default:
   4839                                /*
   4840                                 * Anything else Emit the current input
   4841                                 * character as a character token. Stay in the
   4842                                 * script data double escaped state.
   4843                                 */
   4844                                continue;
   4845                        }
   4846                    }
   4847                    // CPPONLY: MOZ_FALLTHROUGH;
   4848                case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
   4849                    scriptdatadoubleescapeddashloop: for (;;) {
   4850                        if (++pos == endPos) {
   4851                            break stateloop;
   4852                        }
   4853                        c = checkChar(buf, pos);
   4854                        /*
   4855                         * Consume the next input character:
   4856                         */
   4857                        switch (c) {
   4858                            case '-':
   4859                                /*
   4860                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4861                                 * HYPHEN-MINUS character token. Switch to the
   4862                                 * script data double escaped dash dash state.
   4863                                 */
   4864                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
   4865                                // `break` optimizes; `continue stateloop;` would be valid
   4866                                break scriptdatadoubleescapeddashloop;
   4867                            case '<':
   4868                                /*
   4869                                 * U+003C LESS-THAN SIGN (<) Emit a U+003C
   4870                                 * LESS-THAN SIGN character token. Switch to the
   4871                                 * script data double escaped less-than sign
   4872                                 * state.
   4873                                 */
   4874                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
   4875                                continue stateloop;
   4876                            case '\u0000':
   4877                                emitReplacementCharacter(buf, pos);
   4878                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4879                                continue stateloop;
   4880                            case '\r':
   4881                                emitCarriageReturn(buf, pos);
   4882                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4883                                break stateloop;
   4884                            case '\n':
   4885                                silentLineFeed();
   4886                                // CPPONLY: MOZ_FALLTHROUGH;
   4887                            default:
   4888                                /*
   4889                                 * Anything else Emit the current input
   4890                                 * character as a character token. Switch to the
   4891                                 * script data double escaped state.
   4892                                 */
   4893                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4894                                continue stateloop;
   4895                        }
   4896                    }
   4897                    // CPPONLY: MOZ_FALLTHROUGH;
   4898                case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
   4899                    scriptdatadoubleescapeddashdashloop: for (;;) {
   4900                        if (++pos == endPos) {
   4901                            break stateloop;
   4902                        }
   4903                        c = checkChar(buf, pos);
   4904                        /*
   4905                         * Consume the next input character:
   4906                         */
   4907                        switch (c) {
   4908                            case '-':
   4909                                /*
   4910                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
   4911                                 * HYPHEN-MINUS character token. Stay in the
   4912                                 * script data double escaped dash dash state.
   4913                                 */
   4914                                continue;
   4915                            case '<':
   4916                                /*
   4917                                 * U+003C LESS-THAN SIGN (<) Emit a U+003C
   4918                                 * LESS-THAN SIGN character token. Switch to the
   4919                                 * script data double escaped less-than sign
   4920                                 * state.
   4921                                 */
   4922                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
   4923                                // `break` optimizes; `continue stateloop;` would be valid
   4924                                break scriptdatadoubleescapeddashdashloop;
   4925                            case '>':
   4926                                /*
   4927                                 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
   4928                                 * GREATER-THAN SIGN character token. Switch to
   4929                                 * the script data state.
   4930                                 */
   4931                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
   4932                                continue stateloop;
   4933                            case '\u0000':
   4934                                emitReplacementCharacter(buf, pos);
   4935                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4936                                continue stateloop;
   4937                            case '\r':
   4938                                emitCarriageReturn(buf, pos);
   4939                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4940                                break stateloop;
   4941                            case '\n':
   4942                                silentLineFeed();
   4943                                // CPPONLY: MOZ_FALLTHROUGH;
   4944                            default:
   4945                                /*
   4946                                 * Anything else Emit the current input
   4947                                 * character as a character token. Switch to the
   4948                                 * script data double escaped state.
   4949                                 */
   4950                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4951                                continue stateloop;
   4952                        }
   4953                    }
   4954                    // CPPONLY: MOZ_FALLTHROUGH;
   4955                case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
   4956                    scriptdatadoubleescapedlessthanloop: for (;;) {
   4957                        if (++pos == endPos) {
   4958                            break stateloop;
   4959                        }
   4960                        c = checkChar(buf, pos);
   4961                        /*
   4962                         * Consume the next input character:
   4963                         */
   4964                        switch (c) {
   4965                            case '/':
   4966                                /*
   4967                                 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
   4968                                 * character token. Set the temporary buffer to
   4969                                 * the empty string. Switch to the script data
   4970                                 * double escape end state.
   4971                                 */
   4972                                index = 0;
   4973                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
   4974                                // `break` optimizes; `continue stateloop;` would be valid
   4975                                break scriptdatadoubleescapedlessthanloop;
   4976                            default:
   4977                                /*
   4978                                 * Anything else Reconsume the current input
   4979                                 * character in the script data double escaped
   4980                                 * state.
   4981                                 */
   4982                                reconsume = true;
   4983                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   4984                                continue stateloop;
   4985                        }
   4986                    }
   4987                    // CPPONLY: MOZ_FALLTHROUGH;
   4988                case SCRIPT_DATA_DOUBLE_ESCAPE_END:
   4989                    scriptdatadoubleescapeendloop: for (;;) {
   4990                        if (++pos == endPos) {
   4991                            break stateloop;
   4992                        }
   4993                        c = checkChar(buf, pos);
   4994                        if (index < 6) { // SCRIPT_ARR.length
   4995                            char folded = c;
   4996                            if (c >= 'A' && c <= 'Z') {
   4997                                folded += 0x20;
   4998                            }
   4999                            if (folded != Tokenizer.SCRIPT_ARR[index]) {
   5000                                reconsume = true;
   5001                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   5002                                continue stateloop;
   5003                            }
   5004                            index++;
   5005                            continue;
   5006                        }
   5007                        switch (c) {
   5008                            case '\r':
   5009                                emitCarriageReturn(buf, pos);
   5010                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   5011                                break stateloop;
   5012                            case '\n':
   5013                                silentLineFeed();
   5014                                // CPPONLY: MOZ_FALLTHROUGH;
   5015                            case ' ':
   5016                            case '\t':
   5017                            case '\u000C':
   5018                            case '/':
   5019                            case '>':
   5020                                /*
   5021                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5022                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   5023                                 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
   5024                                 * (>) Emit the current input character as a
   5025                                 * character token. If the temporary buffer is
   5026                                 * the string "script", then switch to the
   5027                                 * script data escaped state.
   5028                                 */
   5029                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
   5030                                continue stateloop;
   5031                            default:
   5032                                /*
   5033                                 * Reconsume the current input character in the
   5034                                 * script data double escaped state.
   5035                                 */
   5036                                reconsume = true;
   5037                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
   5038                                continue stateloop;
   5039                        }
   5040                    }
   5041                    // no fallthrough, reordering opportunity
   5042                case MARKUP_DECLARATION_OCTYPE:
   5043                    markupdeclarationdoctypeloop: for (;;) {
   5044                        if (++pos == endPos) {
   5045                            break stateloop;
   5046                        }
   5047                        c = checkChar(buf, pos);
   5048                        if (index < 6) { // OCTYPE.length
   5049                            char folded = c;
   5050                            if (c >= 'A' && c <= 'Z') {
   5051                                folded += 0x20;
   5052                            }
   5053                            if (folded == Tokenizer.OCTYPE[index]) {
   5054                                appendStrBuf(c);
   5055                            } else {
   5056                                errBogusComment();
   5057                                reconsume = true;
   5058                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
   5059                                continue stateloop;
   5060                            }
   5061                            index++;
   5062                            continue;
   5063                        } else {
   5064                            reconsume = true;
   5065                            state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
   5066                            // `break` optimizes; `continue stateloop;` would be valid
   5067                            break markupdeclarationdoctypeloop;
   5068                        }
   5069                    }
   5070                    // CPPONLY: MOZ_FALLTHROUGH;
   5071                case DOCTYPE:
   5072                    doctypeloop: for (;;) {
   5073                        if (reconsume) {
   5074                            reconsume = false;
   5075                        } else {
   5076                            if (++pos == endPos) {
   5077                                break stateloop;
   5078                            }
   5079                            c = checkChar(buf, pos);
   5080                        }
   5081                        initDoctypeFields();
   5082                        /*
   5083                         * Consume the next input character:
   5084                         */
   5085                        switch (c) {
   5086                            case '\r':
   5087                                silentCarriageReturn();
   5088                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
   5089                                break stateloop;
   5090                            case '\n':
   5091                                silentLineFeed();
   5092                                // CPPONLY: MOZ_FALLTHROUGH;
   5093                            case ' ':
   5094                            case '\t':
   5095                            case '\u000C':
   5096                                /*
   5097                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5098                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   5099                                 * Switch to the before DOCTYPE name state.
   5100                                 */
   5101                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
   5102                                // `break` optimizes; `continue stateloop;` would be valid
   5103                                break doctypeloop;
   5104                            default:
   5105                                /*
   5106                                 * Anything else Parse error.
   5107                                 */
   5108                                errMissingSpaceBeforeDoctypeName();
   5109                                /*
   5110                                 * Reconsume the current character in the before
   5111                                 * DOCTYPE name state.
   5112                                 */
   5113                                reconsume = true;
   5114                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
   5115                                // `break` optimizes; `continue stateloop;` would be valid
   5116                                break doctypeloop;
   5117                        }
   5118                    }
   5119                    // CPPONLY: MOZ_FALLTHROUGH;
   5120                case BEFORE_DOCTYPE_NAME:
   5121                    beforedoctypenameloop: for (;;) {
   5122                        if (reconsume) {
   5123                            reconsume = false;
   5124                        } else {
   5125                            if (++pos == endPos) {
   5126                                break stateloop;
   5127                            }
   5128                            c = checkChar(buf, pos);
   5129                        }
   5130                        /*
   5131                         * Consume the next input character:
   5132                         */
   5133                        switch (c) {
   5134                            case '\r':
   5135                                silentCarriageReturn();
   5136                                break stateloop;
   5137                            case '\n':
   5138                                silentLineFeed();
   5139                                // CPPONLY: MOZ_FALLTHROUGH;
   5140                            case ' ':
   5141                            case '\t':
   5142                            case '\u000C':
   5143                                /*
   5144                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5145                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   5146                                 * in the before DOCTYPE name state.
   5147                                 */
   5148                                continue;
   5149                            case '>':
   5150                                /*
   5151                                 * U+003E GREATER-THAN SIGN (>) Parse error.
   5152                                 */
   5153                                errNamelessDoctype();
   5154                                /*
   5155                                 * Create a new DOCTYPE token. Set its
   5156                                 * force-quirks flag to on.
   5157                                 */
   5158                                forceQuirks = true;
   5159                                /*
   5160                                 * Emit the token.
   5161                                 */
   5162                                emitDoctypeToken(pos);
   5163                                /*
   5164                                 * Switch to the data state.
   5165                                 */
   5166                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5167                                if (shouldSuspend) {
   5168                                    break stateloop;
   5169                                }
   5170                                continue stateloop;
   5171                            case '\u0000':
   5172                                c = '\uFFFD';
   5173                                // CPPONLY: MOZ_FALLTHROUGH;
   5174                            default:
   5175                                if (c >= 'A' && c <= 'Z') {
   5176                                    /*
   5177                                     * U+0041 LATIN CAPITAL LETTER A through to
   5178                                     * U+005A LATIN CAPITAL LETTER Z Create a
   5179                                     * new DOCTYPE token. Set the token's name
   5180                                     * to the lowercase version of the input
   5181                                     * character (add 0x0020 to the character's
   5182                                     * code point).
   5183                                     */
   5184                                    c += 0x20;
   5185                                }
   5186                                /* Anything else Create a new DOCTYPE token. */
   5187                                /*
   5188                                  * Set the token's name to the current
   5189                                 * input character.
   5190                                 */
   5191                                clearStrBufBeforeUse();
   5192                                appendStrBuf(c);
   5193                                /*
   5194                                 * Switch to the DOCTYPE name state.
   5195                                 */
   5196                                state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
   5197                                // `break` optimizes; `continue stateloop;` would be valid
   5198                                break beforedoctypenameloop;
   5199                        }
   5200                    }
   5201                    // CPPONLY: MOZ_FALLTHROUGH;
   5202                case DOCTYPE_NAME:
   5203                    doctypenameloop: for (;;) {
   5204                        if (++pos == endPos) {
   5205                            break stateloop;
   5206                        }
   5207                        c = checkChar(buf, pos);
   5208                        /*
   5209                         * Consume the next input character:
   5210                         */
   5211                        switch (c) {
   5212                            case '\r':
   5213                                silentCarriageReturn();
   5214                                strBufToDoctypeName();
   5215                                state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
   5216                                break stateloop;
   5217                            case '\n':
   5218                                silentLineFeed();
   5219                                // CPPONLY: MOZ_FALLTHROUGH;
   5220                            case ' ':
   5221                            case '\t':
   5222                            case '\u000C':
   5223                                /*
   5224                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5225                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   5226                                 * Switch to the after DOCTYPE name state.
   5227                                 */
   5228                                strBufToDoctypeName();
   5229                                state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
   5230                                // `break` optimizes; `continue stateloop;` would be valid
   5231                                break doctypenameloop;
   5232                            case '>':
   5233                                /*
   5234                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   5235                                 * DOCTYPE token.
   5236                                 */
   5237                                strBufToDoctypeName();
   5238                                emitDoctypeToken(pos);
   5239                                /*
   5240                                 * Switch to the data state.
   5241                                 */
   5242                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5243                                if (shouldSuspend) {
   5244                                    break stateloop;
   5245                                }
   5246                                continue stateloop;
   5247                            case '\u0000':
   5248                                c = '\uFFFD';
   5249                                // CPPONLY: MOZ_FALLTHROUGH;
   5250                            default:
   5251                                /*
   5252                                 * U+0041 LATIN CAPITAL LETTER A through to
   5253                                 * U+005A LATIN CAPITAL LETTER Z Append the
   5254                                 * lowercase version of the input character (add
   5255                                 * 0x0020 to the character's code point) to the
   5256                                 * current DOCTYPE token's name.
   5257                                 */
   5258                                if (c >= 'A' && c <= 'Z') {
   5259                                    c += 0x0020;
   5260                                }
   5261                                /*
   5262                                 * Anything else Append the current input
   5263                                 * character to the current DOCTYPE token's
   5264                                 * name.
   5265                                 */
   5266                                appendStrBuf(c);
   5267                                /*
   5268                                 * Stay in the DOCTYPE name state.
   5269                                 */
   5270                                continue;
   5271                        }
   5272                    }
   5273                    // CPPONLY: MOZ_FALLTHROUGH;
   5274                case AFTER_DOCTYPE_NAME:
   5275                    afterdoctypenameloop: for (;;) {
   5276                        if (++pos == endPos) {
   5277                            break stateloop;
   5278                        }
   5279                        c = checkChar(buf, pos);
   5280                        /*
   5281                         * Consume the next input character:
   5282                         */
   5283                        switch (c) {
   5284                            case '\r':
   5285                                silentCarriageReturn();
   5286                                break stateloop;
   5287                            case '\n':
   5288                                silentLineFeed();
   5289                                // CPPONLY: MOZ_FALLTHROUGH;
   5290                            case ' ':
   5291                            case '\t':
   5292                            case '\u000C':
   5293                                /*
   5294                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5295                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   5296                                 * in the after DOCTYPE name state.
   5297                                 */
   5298                                continue;
   5299                            case '>':
   5300                                /*
   5301                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   5302                                 * DOCTYPE token.
   5303                                 */
   5304                                emitDoctypeToken(pos);
   5305                                /*
   5306                                 * Switch to the data state.
   5307                                 */
   5308                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5309                                if (shouldSuspend) {
   5310                                    break stateloop;
   5311                                }
   5312                                continue stateloop;
   5313                            case 'p':
   5314                            case 'P':
   5315                                index = 0;
   5316                                state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
   5317                                // `break` optimizes; `continue stateloop;` would be valid
   5318                                break afterdoctypenameloop;
   5319                            case 's':
   5320                            case 'S':
   5321                                index = 0;
   5322                                state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
   5323                                continue stateloop;
   5324                            default:
   5325                                /*
   5326                                  * Otherwise, this is a parse error.
   5327                                 */
   5328                                bogusDoctype();
   5329 
   5330                                /*
   5331                                 * Set the DOCTYPE token's force-quirks flag to
   5332                                 * on.
   5333                                 */
   5334                                // done by bogusDoctype();
   5335                                /*
   5336                                 * Switch to the bogus DOCTYPE state.
   5337                                 */
   5338                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5339                                continue stateloop;
   5340                        }
   5341                    }
   5342                    // CPPONLY: MOZ_FALLTHROUGH;
   5343                case DOCTYPE_UBLIC:
   5344                    doctypeublicloop: for (;;) {
   5345                        if (++pos == endPos) {
   5346                            break stateloop;
   5347                        }
   5348                        c = checkChar(buf, pos);
   5349                        /*
   5350                         * If the six characters starting from the current input
   5351                         * character are an ASCII case-insensitive match for the
   5352                         * word "PUBLIC", then consume those characters and
   5353                         * switch to the before DOCTYPE public identifier state.
   5354                         */
   5355                        if (index < 5) { // UBLIC.length
   5356                            char folded = c;
   5357                            if (c >= 'A' && c <= 'Z') {
   5358                                folded += 0x20;
   5359                            }
   5360                            if (folded != Tokenizer.UBLIC[index]) {
   5361                                bogusDoctype();
   5362                                // forceQuirks = true;
   5363                                reconsume = true;
   5364                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5365                                continue stateloop;
   5366                            }
   5367                            index++;
   5368                            continue;
   5369                        } else {
   5370                            reconsume = true;
   5371                            state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
   5372                            // `break` optimizes; `continue stateloop;` would be valid
   5373                            break doctypeublicloop;
   5374                        }
   5375                    }
   5376                    // CPPONLY: MOZ_FALLTHROUGH;
   5377                case AFTER_DOCTYPE_PUBLIC_KEYWORD:
   5378                    afterdoctypepublickeywordloop: for (;;) {
   5379                        if (reconsume) {
   5380                            reconsume = false;
   5381                        } else {
   5382                            if (++pos == endPos) {
   5383                                break stateloop;
   5384                            }
   5385                            c = checkChar(buf, pos);
   5386                        }
   5387                        /*
   5388                         * Consume the next input character:
   5389                         */
   5390                        switch (c) {
   5391                            case '\r':
   5392                                silentCarriageReturn();
   5393                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
   5394                                break stateloop;
   5395                            case '\n':
   5396                                silentLineFeed();
   5397                                // CPPONLY: MOZ_FALLTHROUGH;
   5398                            case ' ':
   5399                            case '\t':
   5400                            case '\u000C':
   5401                                /*
   5402                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5403                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   5404                                 * Switch to the before DOCTYPE public
   5405                                 * identifier state.
   5406                                 */
   5407                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
   5408                                // `break` optimizes; `continue stateloop;` would be valid
   5409                                break afterdoctypepublickeywordloop;
   5410                            case '"':
   5411                                /*
   5412                                 * U+0022 QUOTATION MARK (") Parse Error.
   5413                                 */
   5414                                errNoSpaceBetweenDoctypePublicKeywordAndQuote();
   5415                                /*
   5416                                 * Set the DOCTYPE token's public identifier to
   5417                                 * the empty string (not missing),
   5418                                 */
   5419                                clearStrBufBeforeUse();
   5420                                /*
   5421                                 * then switch to the DOCTYPE public identifier
   5422                                 * (double-quoted) state.
   5423                                 */
   5424                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
   5425                                continue stateloop;
   5426                            case '\'':
   5427                                /*
   5428                                 * U+0027 APOSTROPHE (') Parse Error.
   5429                                 */
   5430                                errNoSpaceBetweenDoctypePublicKeywordAndQuote();
   5431                                /*
   5432                                 * Set the DOCTYPE token's public identifier to
   5433                                 * the empty string (not missing),
   5434                                 */
   5435                                clearStrBufBeforeUse();
   5436                                /*
   5437                                 * then switch to the DOCTYPE public identifier
   5438                                 * (single-quoted) state.
   5439                                 */
   5440                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
   5441                                continue stateloop;
   5442                            case '>':
   5443                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
   5444                                errExpectedPublicId();
   5445                                /*
   5446                                 * Set the DOCTYPE token's force-quirks flag to
   5447                                 * on.
   5448                                 */
   5449                                forceQuirks = true;
   5450                                /*
   5451                                 * Emit that DOCTYPE token.
   5452                                 */
   5453                                emitDoctypeToken(pos);
   5454                                /*
   5455                                 * Switch to the data state.
   5456                                 */
   5457                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5458                                if (shouldSuspend) {
   5459                                    break stateloop;
   5460                                }
   5461                                continue stateloop;
   5462                            default:
   5463                                bogusDoctype();
   5464                                /*
   5465                                 * Set the DOCTYPE token's force-quirks flag to
   5466                                 * on.
   5467                                 */
   5468                                // done by bogusDoctype();
   5469                                /*
   5470                                 * Switch to the bogus DOCTYPE state.
   5471                                 */
   5472                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5473                                continue stateloop;
   5474                        }
   5475                    }
   5476                    // CPPONLY: MOZ_FALLTHROUGH;
   5477                case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
   5478                    beforedoctypepublicidentifierloop: for (;;) {
   5479                        if (++pos == endPos) {
   5480                            break stateloop;
   5481                        }
   5482                        c = checkChar(buf, pos);
   5483                        /*
   5484                         * Consume the next input character:
   5485                         */
   5486                        switch (c) {
   5487                            case '\r':
   5488                                silentCarriageReturn();
   5489                                break stateloop;
   5490                            case '\n':
   5491                                silentLineFeed();
   5492                                // CPPONLY: MOZ_FALLTHROUGH;
   5493                            case ' ':
   5494                            case '\t':
   5495                            case '\u000C':
   5496                                /*
   5497                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5498                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   5499                                 * in the before DOCTYPE public identifier
   5500                                 * state.
   5501                                 */
   5502                                continue;
   5503                            case '"':
   5504                                /*
   5505                                 * U+0022 QUOTATION MARK (") Set the DOCTYPE
   5506                                 * token's public identifier to the empty string
   5507                                 * (not missing),
   5508                                 */
   5509                                clearStrBufBeforeUse();
   5510                                /*
   5511                                 * then switch to the DOCTYPE public identifier
   5512                                 * (double-quoted) state.
   5513                                 */
   5514                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
   5515                                // `break` optimizes; `continue stateloop;` would be valid
   5516                                break beforedoctypepublicidentifierloop;
   5517                            case '\'':
   5518                                /*
   5519                                 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
   5520                                 * public identifier to the empty string (not
   5521                                 * missing),
   5522                                 */
   5523                                clearStrBufBeforeUse();
   5524                                /*
   5525                                 * then switch to the DOCTYPE public identifier
   5526                                 * (single-quoted) state.
   5527                                 */
   5528                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
   5529                                continue stateloop;
   5530                            case '>':
   5531                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
   5532                                errExpectedPublicId();
   5533                                /*
   5534                                 * Set the DOCTYPE token's force-quirks flag to
   5535                                 * on.
   5536                                 */
   5537                                forceQuirks = true;
   5538                                /*
   5539                                 * Emit that DOCTYPE token.
   5540                                 */
   5541                                emitDoctypeToken(pos);
   5542                                /*
   5543                                 * Switch to the data state.
   5544                                 */
   5545                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5546                                if (shouldSuspend) {
   5547                                    break stateloop;
   5548                                }
   5549                                continue stateloop;
   5550                            default:
   5551                                bogusDoctype();
   5552                                /*
   5553                                 * Set the DOCTYPE token's force-quirks flag to
   5554                                 * on.
   5555                                 */
   5556                                // done by bogusDoctype();
   5557                                /*
   5558                                 * Switch to the bogus DOCTYPE state.
   5559                                 */
   5560                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5561                                continue stateloop;
   5562                        }
   5563                    }
   5564                    // CPPONLY: MOZ_FALLTHROUGH;
   5565                case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
   5566                    doctypepublicidentifierdoublequotedloop: for (;;) {
   5567                        if (++pos == endPos) {
   5568                            break stateloop;
   5569                        }
   5570                        c = checkChar(buf, pos);
   5571                        /*
   5572                         * Consume the next input character:
   5573                         */
   5574                        switch (c) {
   5575                            case '"':
   5576                                /*
   5577                                 * U+0022 QUOTATION MARK (") Switch to the after
   5578                                 * DOCTYPE public identifier state.
   5579                                 */
   5580                                publicIdentifier = strBufToString();
   5581                                state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
   5582                                // `break` optimizes; `continue stateloop;` would be valid
   5583                                break doctypepublicidentifierdoublequotedloop;
   5584                            case '>':
   5585                                /*
   5586                                 * U+003E GREATER-THAN SIGN (>) Parse error.
   5587                                 */
   5588                                errGtInPublicId();
   5589                                /*
   5590                                 * Set the DOCTYPE token's force-quirks flag to
   5591                                 * on.
   5592                                 */
   5593                                forceQuirks = true;
   5594                                /*
   5595                                 * Emit that DOCTYPE token.
   5596                                 */
   5597                                publicIdentifier = strBufToString();
   5598                                emitDoctypeToken(pos);
   5599                                /*
   5600                                 * Switch to the data state.
   5601                                 */
   5602                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5603                                if (shouldSuspend) {
   5604                                    break stateloop;
   5605                                }
   5606                                continue stateloop;
   5607                            case '\r':
   5608                                appendStrBufCarriageReturn();
   5609                                break stateloop;
   5610                            case '\n':
   5611                                appendStrBufLineFeed();
   5612                                continue;
   5613                            case '\u0000':
   5614                                c = '\uFFFD';
   5615                                // CPPONLY: MOZ_FALLTHROUGH;
   5616                            default:
   5617                                /*
   5618                                 * Anything else Append the current input
   5619                                 * character to the current DOCTYPE token's
   5620                                 * public identifier.
   5621                                 */
   5622                                appendStrBuf(c);
   5623                                /*
   5624                                 * Stay in the DOCTYPE public identifier
   5625                                 * (double-quoted) state.
   5626                                 */
   5627                                continue;
   5628                        }
   5629                    }
   5630                    // CPPONLY: MOZ_FALLTHROUGH;
   5631                case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
   5632                    afterdoctypepublicidentifierloop: for (;;) {
   5633                        if (++pos == endPos) {
   5634                            break stateloop;
   5635                        }
   5636                        c = checkChar(buf, pos);
   5637                        /*
   5638                         * Consume the next input character:
   5639                         */
   5640                        switch (c) {
   5641                            case '\r':
   5642                                silentCarriageReturn();
   5643                                state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
   5644                                break stateloop;
   5645                            case '\n':
   5646                                silentLineFeed();
   5647                                // CPPONLY: MOZ_FALLTHROUGH;
   5648                            case ' ':
   5649                            case '\t':
   5650                            case '\u000C':
   5651                                /*
   5652                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5653                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
   5654                                 * Switch to the between DOCTYPE public and
   5655                                 * system identifiers state.
   5656                                 */
   5657                                state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
   5658                                // `break` optimizes; `continue stateloop;` would be valid
   5659                                break afterdoctypepublicidentifierloop;
   5660                            case '>':
   5661                                /*
   5662                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   5663                                 * DOCTYPE token.
   5664                                 */
   5665                                emitDoctypeToken(pos);
   5666                                /*
   5667                                 * Switch to the data state.
   5668                                 */
   5669                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5670                                if (shouldSuspend) {
   5671                                    break stateloop;
   5672                                }
   5673                                continue stateloop;
   5674                            case '"':
   5675                                /*
   5676                                 * U+0022 QUOTATION MARK (") Parse error.
   5677                                 */
   5678                                errNoSpaceBetweenPublicAndSystemIds();
   5679                                /*
   5680                                 * Set the DOCTYPE token's system identifier to
   5681                                 * the empty string (not missing),
   5682                                 */
   5683                                clearStrBufBeforeUse();
   5684                                /*
   5685                                 * then switch to the DOCTYPE system identifier
   5686                                 * (double-quoted) state.
   5687                                 */
   5688                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
   5689                                continue stateloop;
   5690                            case '\'':
   5691                                /*
   5692                                 * U+0027 APOSTROPHE (') Parse error.
   5693                                 */
   5694                                errNoSpaceBetweenPublicAndSystemIds();
   5695                                /*
   5696                                 * Set the DOCTYPE token's system identifier to
   5697                                 * the empty string (not missing),
   5698                                 */
   5699                                clearStrBufBeforeUse();
   5700                                /*
   5701                                 * then switch to the DOCTYPE system identifier
   5702                                 * (single-quoted) state.
   5703                                 */
   5704                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
   5705                                continue stateloop;
   5706                            default:
   5707                                bogusDoctype();
   5708                                /*
   5709                                 * Set the DOCTYPE token's force-quirks flag to
   5710                                 * on.
   5711                                 */
   5712                                // done by bogusDoctype();
   5713                                /*
   5714                                 * Switch to the bogus DOCTYPE state.
   5715                                 */
   5716                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5717                                continue stateloop;
   5718                        }
   5719                    }
   5720                    // CPPONLY: MOZ_FALLTHROUGH;
   5721                case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
   5722                    betweendoctypepublicandsystemidentifiersloop: for (;;) {
   5723                        if (++pos == endPos) {
   5724                            break stateloop;
   5725                        }
   5726                        c = checkChar(buf, pos);
   5727                        /*
   5728                         * Consume the next input character:
   5729                         */
   5730                        switch (c) {
   5731                            case '\r':
   5732                                silentCarriageReturn();
   5733                                break stateloop;
   5734                            case '\n':
   5735                                silentLineFeed();
   5736                                // CPPONLY: MOZ_FALLTHROUGH;
   5737                            case ' ':
   5738                            case '\t':
   5739                            case '\u000C':
   5740                                /*
   5741                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5742                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   5743                                 * in the between DOCTYPE public and system
   5744                                 * identifiers state.
   5745                                 */
   5746                                continue;
   5747                            case '>':
   5748                                /*
   5749                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   5750                                 * DOCTYPE token.
   5751                                 */
   5752                                emitDoctypeToken(pos);
   5753                                /*
   5754                                 * Switch to the data state.
   5755                                 */
   5756                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5757                                if (shouldSuspend) {
   5758                                    break stateloop;
   5759                                }
   5760                                continue stateloop;
   5761                            case '"':
   5762                                /*
   5763                                 * U+0022 QUOTATION MARK (") Set the DOCTYPE
   5764                                 * token's system identifier to the empty string
   5765                                 * (not missing),
   5766                                 */
   5767                                clearStrBufBeforeUse();
   5768                                /*
   5769                                 * then switch to the DOCTYPE system identifier
   5770                                 * (double-quoted) state.
   5771                                 */
   5772                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
   5773                                // `break` optimizes; `continue stateloop;` would be valid
   5774                                break betweendoctypepublicandsystemidentifiersloop;
   5775                            case '\'':
   5776                                /*
   5777                                 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
   5778                                 * system identifier to the empty string (not
   5779                                 * missing),
   5780                                 */
   5781                                clearStrBufBeforeUse();
   5782                                /*
   5783                                 * then switch to the DOCTYPE system identifier
   5784                                 * (single-quoted) state.
   5785                                 */
   5786                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
   5787                                continue stateloop;
   5788                            default:
   5789                                bogusDoctype();
   5790                                /*
   5791                                 * Set the DOCTYPE token's force-quirks flag to
   5792                                 * on.
   5793                                 */
   5794                                // done by bogusDoctype();
   5795                                /*
   5796                                 * Switch to the bogus DOCTYPE state.
   5797                                 */
   5798                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5799                                continue stateloop;
   5800                        }
   5801                    }
   5802                    // CPPONLY: MOZ_FALLTHROUGH;
   5803                case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
   5804                    doctypesystemidentifierdoublequotedloop: for (;;) {
   5805                        if (++pos == endPos) {
   5806                            break stateloop;
   5807                        }
   5808                        c = checkChar(buf, pos);
   5809                        /*
   5810                         * Consume the next input character:
   5811                         */
   5812                        switch (c) {
   5813                            case '"':
   5814                                /*
   5815                                 * U+0022 QUOTATION MARK (") Switch to the after
   5816                                 * DOCTYPE system identifier state.
   5817                                 */
   5818                                systemIdentifier = strBufToString();
   5819                                state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
   5820                                // `break` optimizes; `continue stateloop;` would be valid
   5821                                break doctypesystemidentifierdoublequotedloop;
   5822                            case '>':
   5823                                /*
   5824                                 * U+003E GREATER-THAN SIGN (>) Parse error.
   5825                                 */
   5826                                errGtInSystemId();
   5827                                /*
   5828                                 * Set the DOCTYPE token's force-quirks flag to
   5829                                 * on.
   5830                                 */
   5831                                forceQuirks = true;
   5832                                /*
   5833                                 * Emit that DOCTYPE token.
   5834                                 */
   5835                                systemIdentifier = strBufToString();
   5836                                emitDoctypeToken(pos);
   5837                                /*
   5838                                 * Switch to the data state.
   5839                                 */
   5840                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5841                                if (shouldSuspend) {
   5842                                    break stateloop;
   5843                                }
   5844                                continue stateloop;
   5845                            case '\r':
   5846                                appendStrBufCarriageReturn();
   5847                                break stateloop;
   5848                            case '\n':
   5849                                appendStrBufLineFeed();
   5850                                continue;
   5851                            case '\u0000':
   5852                                c = '\uFFFD';
   5853                                // CPPONLY: MOZ_FALLTHROUGH;
   5854                            default:
   5855                                /*
   5856                                 * Anything else Append the current input
   5857                                 * character to the current DOCTYPE token's
   5858                                 * system identifier.
   5859                                 */
   5860                                appendStrBuf(c);
   5861                                /*
   5862                                 * Stay in the DOCTYPE system identifier
   5863                                 * (double-quoted) state.
   5864                                 */
   5865                                continue;
   5866                        }
   5867                    }
   5868                    // CPPONLY: MOZ_FALLTHROUGH;
   5869                case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
   5870                    afterdoctypesystemidentifierloop: for (;;) {
   5871                        if (++pos == endPos) {
   5872                            break stateloop;
   5873                        }
   5874                        c = checkChar(buf, pos);
   5875                        /*
   5876                         * Consume the next input character:
   5877                         */
   5878                        switch (c) {
   5879                            case '\r':
   5880                                silentCarriageReturn();
   5881                                break stateloop;
   5882                            case '\n':
   5883                                silentLineFeed();
   5884                                // CPPONLY: MOZ_FALLTHROUGH;
   5885                            case ' ':
   5886                            case '\t':
   5887                            case '\u000C':
   5888                                /*
   5889                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   5890                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   5891                                 * in the after DOCTYPE system identifier state.
   5892                                 */
   5893                                continue;
   5894                            case '>':
   5895                                /*
   5896                                 * U+003E GREATER-THAN SIGN (>) Emit the current
   5897                                 * DOCTYPE token.
   5898                                 */
   5899                                emitDoctypeToken(pos);
   5900                                /*
   5901                                 * Switch to the data state.
   5902                                 */
   5903                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5904                                if (shouldSuspend) {
   5905                                    break stateloop;
   5906                                }
   5907                                continue stateloop;
   5908                            default:
   5909                                /*
   5910                                 * Switch to the bogus DOCTYPE state. (This does
   5911                                 * not set the DOCTYPE token's force-quirks flag
   5912                                 * to on.)
   5913                                 */
   5914                                bogusDoctypeWithoutQuirks();
   5915                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5916                                // `break` optimizes; `continue stateloop;` would be valid
   5917                                break afterdoctypesystemidentifierloop;
   5918                        }
   5919                    }
   5920                    // CPPONLY: MOZ_FALLTHROUGH;
   5921                case BOGUS_DOCTYPE:
   5922                    for (;;) {
   5923                        if (reconsume) {
   5924                            reconsume = false;
   5925                        } else {
   5926                            if (++pos == endPos) {
   5927                                break stateloop;
   5928                            }
   5929                            c = checkChar(buf, pos);
   5930                        }
   5931                        /*
   5932                         * Consume the next input character:
   5933                         */
   5934                        switch (c) {
   5935                            case '>':
   5936                                /*
   5937                                 * U+003E GREATER-THAN SIGN (>) Emit that
   5938                                 * DOCTYPE token.
   5939                                 */
   5940                                emitDoctypeToken(pos);
   5941                                /*
   5942                                 * Switch to the data state.
   5943                                 */
   5944                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   5945                                if (shouldSuspend) {
   5946                                    break stateloop;
   5947                                }
   5948                                continue stateloop;
   5949                            case '\r':
   5950                                silentCarriageReturn();
   5951                                break stateloop;
   5952                            case '\n':
   5953                                silentLineFeed();
   5954                                // CPPONLY: MOZ_FALLTHROUGH;
   5955                            default:
   5956                                /*
   5957                                 * Anything else Stay in the bogus DOCTYPE
   5958                                 * state.
   5959                                 */
   5960                                continue;
   5961                        }
   5962                    }
   5963                    // no fallthrough, reordering opportunity
   5964                case DOCTYPE_YSTEM:
   5965                    doctypeystemloop: for (;;) {
   5966                        if (++pos == endPos) {
   5967                            break stateloop;
   5968                        }
   5969                        c = checkChar(buf, pos);
   5970                        /*
   5971                         * Otherwise, if the six characters starting from the
   5972                         * current input character are an ASCII case-insensitive
   5973                         * match for the word "SYSTEM", then consume those
   5974                         * characters and switch to the before DOCTYPE system
   5975                         * identifier state.
   5976                         */
   5977                        if (index < 5) { // YSTEM.length
   5978                            char folded = c;
   5979                            if (c >= 'A' && c <= 'Z') {
   5980                                folded += 0x20;
   5981                            }
   5982                            if (folded != Tokenizer.YSTEM[index]) {
   5983                                bogusDoctype();
   5984                                reconsume = true;
   5985                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   5986                                continue stateloop;
   5987                            }
   5988                            index++;
   5989                            continue stateloop;
   5990                        } else {
   5991                            reconsume = true;
   5992                            state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
   5993                            // `break` optimizes; `continue stateloop;` would be valid
   5994                            break doctypeystemloop;
   5995                        }
   5996                    }
   5997                    // CPPONLY: MOZ_FALLTHROUGH;
   5998                case AFTER_DOCTYPE_SYSTEM_KEYWORD:
   5999                    afterdoctypesystemkeywordloop: for (;;) {
   6000                        if (reconsume) {
   6001                            reconsume = false;
   6002                        } else {
   6003                            if (++pos == endPos) {
   6004                                break stateloop;
   6005                            }
   6006                            c = checkChar(buf, pos);
   6007                        }
   6008                        /*
   6009                         * Consume the next input character:
   6010                         */
   6011                        switch (c) {
   6012                            case '\r':
   6013                                silentCarriageReturn();
   6014                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
   6015                                break stateloop;
   6016                            case '\n':
   6017                                silentLineFeed();
   6018                                // CPPONLY: MOZ_FALLTHROUGH;
   6019                            case ' ':
   6020                            case '\t':
   6021                            case '\u000C':
   6022                                /*
   6023                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   6024                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
                                 * Switch to the before DOCTYPE system
                                 * identifier state.
   6027                                 */
   6028                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
   6029                                // `break` optimizes; `continue stateloop;` would be valid
   6030                                break afterdoctypesystemkeywordloop;
   6031                            case '"':
   6032                                /*
   6033                                 * U+0022 QUOTATION MARK (") Parse Error.
   6034                                 */
   6035                                errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
   6036                                /*
   6037                                 * Set the DOCTYPE token's system identifier to
   6038                                 * the empty string (not missing),
   6039                                 */
   6040                                clearStrBufBeforeUse();
   6041                                /*
                                 * then switch to the DOCTYPE system identifier
                                 * (double-quoted) state.
   6044                                 */
   6045                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
   6046                                continue stateloop;
   6047                            case '\'':
   6048                                /*
   6049                                 * U+0027 APOSTROPHE (') Parse Error.
   6050                                 */
   6051                                errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
   6052                                /*
                                 * Set the DOCTYPE token's system identifier to
                                 * the empty string (not missing),
   6055                                 */
   6056                                clearStrBufBeforeUse();
   6057                                /*
                                 * then switch to the DOCTYPE system identifier
                                 * (single-quoted) state.
   6060                                 */
   6061                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
   6062                                continue stateloop;
   6063                            case '>':
   6064                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
   6065                                errExpectedPublicId();
   6066                                /*
   6067                                 * Set the DOCTYPE token's force-quirks flag to
   6068                                 * on.
   6069                                 */
   6070                                forceQuirks = true;
   6071                                /*
   6072                                 * Emit that DOCTYPE token.
   6073                                 */
   6074                                emitDoctypeToken(pos);
   6075                                /*
   6076                                 * Switch to the data state.
   6077                                 */
   6078                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   6079                                if (shouldSuspend) {
   6080                                    break stateloop;
   6081                                }
   6082                                continue stateloop;
   6083                            default:
   6084                                bogusDoctype();
   6085                                /*
   6086                                 * Set the DOCTYPE token's force-quirks flag to
   6087                                 * on.
   6088                                 */
   6089                                // done by bogusDoctype();
   6090                                /*
   6091                                 * Switch to the bogus DOCTYPE state.
   6092                                 */
   6093                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   6094                                continue stateloop;
   6095                        }
   6096                    }
   6097                    // CPPONLY: MOZ_FALLTHROUGH;
   6098                case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
   6099                    beforedoctypesystemidentifierloop: for (;;) {
   6100                        if (++pos == endPos) {
   6101                            break stateloop;
   6102                        }
   6103                        c = checkChar(buf, pos);
   6104                        /*
   6105                         * Consume the next input character:
   6106                         */
   6107                        switch (c) {
   6108                            case '\r':
   6109                                silentCarriageReturn();
   6110                                break stateloop;
   6111                            case '\n':
   6112                                silentLineFeed();
   6113                                // CPPONLY: MOZ_FALLTHROUGH;
   6114                            case ' ':
   6115                            case '\t':
   6116                            case '\u000C':
   6117                                /*
   6118                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
   6119                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
   6120                                 * in the before DOCTYPE system identifier
   6121                                 * state.
   6122                                 */
   6123                                continue;
   6124                            case '"':
   6125                                /*
   6126                                 * U+0022 QUOTATION MARK (") Set the DOCTYPE
   6127                                 * token's system identifier to the empty string
   6128                                 * (not missing),
   6129                                 */
   6130                                clearStrBufBeforeUse();
   6131                                /*
   6132                                 * then switch to the DOCTYPE system identifier
   6133                                 * (double-quoted) state.
   6134                                 */
   6135                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
   6136                                continue stateloop;
   6137                            case '\'':
   6138                                /*
   6139                                 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
   6140                                 * system identifier to the empty string (not
   6141                                 * missing),
   6142                                 */
   6143                                clearStrBufBeforeUse();
   6144                                /*
   6145                                 * then switch to the DOCTYPE system identifier
   6146                                 * (single-quoted) state.
   6147                                 */
   6148                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
   6149                                // `break` optimizes; `continue stateloop;` would be valid
   6150                                break beforedoctypesystemidentifierloop;
   6151                            case '>':
   6152                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
   6153                                errExpectedSystemId();
   6154                                /*
   6155                                 * Set the DOCTYPE token's force-quirks flag to
   6156                                 * on.
   6157                                 */
   6158                                forceQuirks = true;
   6159                                /*
   6160                                 * Emit that DOCTYPE token.
   6161                                 */
   6162                                emitDoctypeToken(pos);
   6163                                /*
   6164                                 * Switch to the data state.
   6165                                 */
   6166                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   6167                                if (shouldSuspend) {
   6168                                    break stateloop;
   6169                                }
   6170                                continue stateloop;
   6171                            default:
   6172                                bogusDoctype();
   6173                                /*
   6174                                 * Set the DOCTYPE token's force-quirks flag to
   6175                                 * on.
   6176                                 */
   6177                                // done by bogusDoctype();
   6178                                /*
   6179                                 * Switch to the bogus DOCTYPE state.
   6180                                 */
   6181                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
   6182                                continue stateloop;
   6183                        }
   6184                    }
   6185                    // CPPONLY: MOZ_FALLTHROUGH;
   6186                case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
   6187                    for (;;) {
   6188                        if (++pos == endPos) {
   6189                            break stateloop;
   6190                        }
   6191                        c = checkChar(buf, pos);
   6192                        /*
   6193                         * Consume the next input character:
   6194                         */
   6195                        switch (c) {
   6196                            case '\'':
   6197                                /*
   6198                                 * U+0027 APOSTROPHE (') Switch to the after
   6199                                 * DOCTYPE system identifier state.
   6200                                 */
   6201                                systemIdentifier = strBufToString();
   6202                                state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
   6203                                continue stateloop;
   6204                            case '>':
   6205                                errGtInSystemId();
   6206                                /*
   6207                                 * Set the DOCTYPE token's force-quirks flag to
   6208                                 * on.
   6209                                 */
   6210                                forceQuirks = true;
   6211                                /*
   6212                                 * Emit that DOCTYPE token.
   6213                                 */
   6214                                systemIdentifier = strBufToString();
   6215                                emitDoctypeToken(pos);
   6216                                /*
   6217                                 * Switch to the data state.
   6218                                 */
   6219                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   6220                                if (shouldSuspend) {
   6221                                    break stateloop;
   6222                                }
   6223                                continue stateloop;
   6224                            case '\r':
   6225                                appendStrBufCarriageReturn();
   6226                                break stateloop;
   6227                            case '\n':
   6228                                appendStrBufLineFeed();
   6229                                continue;
   6230                            case '\u0000':
   6231                                c = '\uFFFD';
   6232                                // CPPONLY: MOZ_FALLTHROUGH;
   6233                            default:
   6234                                /*
   6235                                 * Anything else Append the current input
   6236                                 * character to the current DOCTYPE token's
   6237                                 * system identifier.
   6238                                 */
   6239                                appendStrBuf(c);
   6240                                /*
                                 * Stay in the DOCTYPE system identifier
                                 * (single-quoted) state.
   6243                                 */
   6244                                continue;
   6245                        }
   6246                    }
   6247                    // no fallthrough, reordering opportunity
   6248                case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
   6249                    for (;;) {
   6250                        if (++pos == endPos) {
   6251                            break stateloop;
   6252                        }
   6253                        c = checkChar(buf, pos);
   6254                        /*
   6255                         * Consume the next input character:
   6256                         */
   6257                        switch (c) {
   6258                            case '\'':
   6259                                /*
   6260                                 * U+0027 APOSTROPHE (') Switch to the after
   6261                                 * DOCTYPE public identifier state.
   6262                                 */
   6263                                publicIdentifier = strBufToString();
   6264                                state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
   6265                                continue stateloop;
   6266                            case '>':
   6267                                errGtInPublicId();
   6268                                /*
   6269                                 * Set the DOCTYPE token's force-quirks flag to
   6270                                 * on.
   6271                                 */
   6272                                forceQuirks = true;
   6273                                /*
   6274                                 * Emit that DOCTYPE token.
   6275                                 */
   6276                                publicIdentifier = strBufToString();
   6277                                emitDoctypeToken(pos);
   6278                                /*
   6279                                 * Switch to the data state.
   6280                                 */
   6281                                state = transition(state, Tokenizer.DATA, reconsume, pos);
   6282                                if (shouldSuspend) {
   6283                                    break stateloop;
   6284                                }
   6285                                continue stateloop;
   6286                            case '\r':
   6287                                appendStrBufCarriageReturn();
   6288                                break stateloop;
   6289                            case '\n':
   6290                                appendStrBufLineFeed();
   6291                                continue;
   6292                            case '\u0000':
   6293                                c = '\uFFFD';
   6294                                // CPPONLY: MOZ_FALLTHROUGH;
   6295                            default:
   6296                                /*
   6297                                 * Anything else Append the current input
   6298                                 * character to the current DOCTYPE token's
   6299                                 * public identifier.
   6300                                 */
   6301                                appendStrBuf(c);
   6302                                /*
   6303                                 * Stay in the DOCTYPE public identifier
   6304                                 * (single-quoted) state.
   6305                                 */
   6306                                continue;
   6307                        }
   6308                    }
   6309                    // no fallthrough, reordering opportunity
   6310                case PROCESSING_INSTRUCTION:
   6311                    processinginstructionloop: for (;;) {
   6312                        if (++pos == endPos) {
   6313                            break stateloop;
   6314                        }
   6315                        c = checkChar(buf, pos);
   6316                        switch (c) {
   6317                            case '?':
   6318                                state = transition(
   6319                                        state,
   6320                                        Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
   6321                                        reconsume, pos);
   6322                                // `break` optimizes; `continue stateloop;` would be valid
   6323                                break processinginstructionloop;
   6324                            default:
   6325                                continue;
   6326                        }
   6327                    }
   6328                    // CPPONLY: MOZ_FALLTHROUGH;
   6329                case PROCESSING_INSTRUCTION_QUESTION_MARK:
   6330                    if (++pos == endPos) {
   6331                        break stateloop;
   6332                    }
   6333                    c = checkChar(buf, pos);
   6334                    switch (c) {
   6335                        case '>':
   6336                            state = transition(state, Tokenizer.DATA,
   6337                                    reconsume, pos);
   6338                            // Processing instruction syntax goes through these
   6339                            // states only in Gecko's XML View Source--not in HTML
   6340                            // parsing in Java or in Gecko.
   6341                            // Since XML View Source doesn't use the
   6342                            // suspension-after-current-token facility, its extension
   6343                            // to processing-instruction states is strictly unnecessary
   6344                            // at the moment. However, if these states ever were to be
   6345                            // used together with the suspension-after-current-token
   6346                            // facility, these states would need to participate, since
   6347                            // suspension could be requested when only less-than has been
   6348                            // seen and we don't yet know if we end up here. Handling
   6349                            // the currently-unnecessary case in order to avoid leaving
   6350                            // a trap for future modification.
   6351                            suspendIfRequestedAfterCurrentNonTextToken();
   6352                            if (shouldSuspend) {
   6353                                break stateloop;
   6354                            }
   6355                            continue stateloop;
   6356                        default:
   6357                            state = transition(state,
   6358                                    Tokenizer.PROCESSING_INSTRUCTION,
   6359                                    reconsume, pos);
   6360                            continue stateloop;
   6361                    }
   6362                    // END HOTSPOT WORKAROUND
   6363            }
   6364        }
   6365        flushChars(buf, pos);
   6366        /*
   6367         * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
   6368         */
   6369        // Save locals
   6370        stateSave = state;
   6371        returnStateSave = returnState;
   6372        return pos;
   6373    }
   6374 
   6375    // HOTSPOT WORKAROUND INSERTION POINT
   6376 
   6377    // [NOCPP[
   6378 
    /**
     * Called by the state loop whenever the tokenizer moves to another state;
     * the value returned is what the loop stores as its new state. This
     * implementation ignores everything except <code>to</code> and returns it
     * unchanged. (Being <code>protected</code> and non-final, it is presumably
     * meant as an override point for subclasses -- confirm against subclasses.)
     *
     * @param from
     *            the state being left (unused here)
     * @param to
     *            the state being entered
     * @param reconsume
     *            the loop's current reconsume flag (unused here)
     * @param pos
     *            the loop's current buffer position (unused here)
     * @return <code>to</code>
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
        return to;
    }
   6382 
   6383    // ]NOCPP]
   6384 
   6385    private void initDoctypeFields() {
   6386        // Discard the characters "DOCTYPE" accumulated as a potential bogus
   6387        // comment into strBuf.
   6388        clearStrBufAfterUse();
   6389        doctypeName = null;
   6390        if (systemIdentifier != null) {
   6391            Portability.releaseString(systemIdentifier);
   6392            systemIdentifier = null;
   6393        }
   6394        if (publicIdentifier != null) {
   6395            Portability.releaseString(publicIdentifier);
   6396            publicIdentifier = null;
   6397        }
   6398        forceQuirks = false;
   6399    }
   6400 
    /**
     * Handles a carriage return for callers that append through
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>: the CR is first
     * recorded with <code>silentCarriageReturn()</code> and then a line feed
     * (not a carriage return) is passed on, with <code>false</code> as the
     * second argument.
     *
     * @throws SAXException
     *             declared because the delegate is called from a throwing
     *             context; presumably propagated from error reporting
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        // The CR is appended as '\n'.
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
    }
   6406 
    /**
     * Handles a line feed for callers that append through
     * <code>adjustDoubleHyphenAndAppendToStrBufAndErr</code>: the LF is first
     * recorded with <code>silentLineFeed()</code> and then passed on as
     * <code>'\n'</code>, with <code>false</code> as the second argument.
     *
     * @throws SAXException
     *             declared because the delegate is called from a throwing
     *             context; presumably propagated from error reporting
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
    }
   6412 
    /**
     * Records a line feed with <code>silentLineFeed()</code> and appends
     * <code>'\n'</code> to the string buffer.
     */
    @Inline private void appendStrBufLineFeed() {
        silentLineFeed();
        appendStrBuf('\n');
    }
   6417 
    /**
     * Records a carriage return with <code>silentCarriageReturn()</code> and
     * appends a line feed to the string buffer, so the CR is stored as
     * <code>'\n'</code>.
     */
    @Inline private void appendStrBufCarriageReturn() {
        silentCarriageReturn();
        // The buffer receives '\n', not '\r'.
        appendStrBuf('\n');
    }
   6422 
   6423    // [NOCPP[
   6424 
   6425    @Inline protected void silentCarriageReturn() {
   6426        ++line;
   6427        lastCR = true;
   6428    }
   6429 
   6430    @Inline protected void silentLineFeed() {
   6431        ++line;
   6432    }
   6433 
   6434    // ]NOCPP]
   6435 
    /**
     * Handles a carriage return in character data: records the CR, flushes
     * the characters pending before <code>pos</code>, and reports a single
     * character taken from <code>Tokenizer.LF</code> to the token handler in
     * place of the CR.
     *
     * @param buf
     *            the buffer being tokenized
     * @param pos
     *            position of the carriage return in <code>buf</code>
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    @Inline private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): MAX_VALUE looks like a "no pending run" sentinel that
        // the caller replaces on the next character -- confirm against
        // flushChars and the callers.
        cstart = Integer.MAX_VALUE;
    }
   6443 
    /**
     * Flushes the characters pending before <code>pos</code>, notifies the
     * token handler via <code>zeroOriginatingReplacementCharacter()</code>,
     * and moves <code>cstart</code> to the position after <code>pos</code>.
     *
     * @param buf
     *            the buffer being tokenized
     * @param pos
     *            position of the character being replaced in <code>buf</code>
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        // Skip the character at pos when the next run of characters starts.
        cstart = pos + 1;
    }
   6450 
    /**
     * Flushes the characters pending before <code>pos</code>, notifies the
     * token handler via <code>zeroOrReplacementCharacter()</code> (the handler
     * decides what, if anything, is emitted), and moves <code>cstart</code>
     * to the position after <code>pos</code>.
     *
     * @param buf
     *            the buffer being tokenized
     * @param pos
     *            position of the character being handled in <code>buf</code>
     * @throws SAXException
     *             if flushing or the token handler throws
     */
    private void maybeEmitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOrReplacementCharacter();
        // Skip the character at pos when the next run of characters starts.
        cstart = pos + 1;
    }
   6457 
   6458    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
   6459            throws SAXException {
   6460        flushChars(buf, pos);
   6461        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
   6462        cstart = pos + 1;
   6463    }
   6464 
    /**
     * Stores <code>add</code> in the <code>additional</code> field and, in
     * the Java-only section delimited by the NOCPP markers, snapshots the
     * current location into <code>ampersandLocation</code>.
     *
     * @param add
     *            the value to store in <code>additional</code>
     */
    @Inline private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }
   6471 
    /**
     * Reports a bogus DOCTYPE via <code>errBogusDoctype()</code> and turns
     * the force-quirks flag on. The flag is written only after the error call
     * returns.
     *
     * @throws SAXException
     *             if the error report throws
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }
   6476 
    /**
     * Reports a bogus DOCTYPE via <code>errBogusDoctype()</code> and sets the
     * force-quirks flag to <code>false</code>. The flag is written only after
     * the error call returns.
     *
     * @throws SAXException
     *             if the error report throws
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks = false;
    }
   6481 
   6482    private void handleNcrValue(int returnState) throws SAXException {
   6483        /*
   6484         * If one or more characters match the range, then take them all and
   6485         * interpret the string of characters as a number (either hexadecimal or
   6486         * decimal as appropriate).
   6487         */
   6488        if (value <= 0xFFFF) {
   6489            if (value >= 0x80 && value <= 0x9f) {
   6490                /*
   6491                 * If that number is one of the numbers in the first column of
   6492                 * the following table, then this is a parse error.
   6493                 */
   6494                errNcrInC1Range();
   6495                /*
   6496                 * Find the row with that number in the first column, and return
   6497                 * a character token for the Unicode character given in the
   6498                 * second column of that row.
   6499                 */
   6500                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
   6501                emitOrAppendOne(val, returnState);
   6502                // [NOCPP[
   6503            } else if (value == 0xC
   6504                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
   6505                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
   6506                    emitOrAppendOne(Tokenizer.SPACE, returnState);
   6507                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
   6508                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
   6509                }
   6510                // ]NOCPP]
   6511            } else if (value == 0x0) {
   6512                errNcrZero();
   6513                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
   6514            } else if ((value & 0xF800) == 0xD800) {
   6515                errNcrSurrogate();
   6516                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
   6517            } else {
   6518                /*
   6519                 * Otherwise, return a character token for the Unicode character
   6520                 * whose code point is that number.
   6521                 */
   6522                char ch = (char) value;
   6523                // [NOCPP[
   6524                if (value == 0x0D) {
   6525                    errNcrCr();
   6526                } else if ((value <= 0x0008) || (value == 0x000B)
   6527                        || (value >= 0x000E && value <= 0x001F)) {
   6528                    ch = errNcrControlChar(ch);
   6529                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
   6530                    errNcrUnassigned();
   6531                } else if ((value & 0xFFFE) == 0xFFFE) {
   6532                    ch = errNcrNonCharacter(ch);
   6533                } else if (value >= 0x007F && value <= 0x009F) {
   6534                    errNcrControlChar();
   6535                } else {
   6536                    maybeWarnPrivateUse(ch);
   6537                }
   6538                // ]NOCPP]
   6539                bmpChar[0] = ch;
   6540                emitOrAppendOne(bmpChar, returnState);
   6541            }
   6542        } else if (value <= 0x10FFFF) {
   6543            // [NOCPP[
   6544            maybeWarnPrivateUseAstral();
   6545            if ((value & 0xFFFE) == 0xFFFE) {
   6546                errAstralNonCharacter(value);
   6547            }
   6548            // ]NOCPP]
   6549            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
   6550            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
   6551            emitOrAppendTwo(astralChar, returnState);
   6552        } else {
   6553            errNcrOutOfRange();
   6554            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
   6555        }
   6556    }
   6557 
   6558    public void eof() throws SAXException {
   6559        int state = stateSave;
   6560        int returnState = returnStateSave;
   6561 
   6562        eofloop: for (;;) {
   6563            switch (state) {
   6564                case SCRIPT_DATA_LESS_THAN_SIGN:
   6565                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
   6566                    /*
   6567                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
   6568                     */
   6569                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   6570                    /*
   6571                     * and reconsume the current input character in the data
   6572                     * state.
   6573                     */
   6574                    break eofloop;
   6575                case TAG_OPEN:
   6576                    /*
   6577                     * The behavior of this state depends on the content model
   6578                     * flag.
   6579                     */
   6580                    /*
   6581                     * Anything else Parse error.
   6582                     */
   6583                    errEofAfterLt();
   6584                    /*
   6585                     * Emit a U+003C LESS-THAN SIGN character token
   6586                     */
   6587                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   6588                    /*
   6589                     * and reconsume the current input character in the data
   6590                     * state.
   6591                     */
   6592                    break eofloop;
   6593                case RAWTEXT_RCDATA_LESS_THAN_SIGN:
   6594                    /*
   6595                     * Emit a U+003C LESS-THAN SIGN character token
   6596                     */
   6597                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
   6598                    /*
   6599                     * and reconsume the current input character in the RCDATA
   6600                     * state.
   6601                     */
   6602                    break eofloop;
   6603                case NON_DATA_END_TAG_NAME:
   6604                    /*
   6605                     * Emit a U+003C LESS-THAN SIGN character token, a U+002F
   6606                     * SOLIDUS character token,
   6607                     */
   6608                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
   6609                    /*
   6610                     * a character token for each of the characters in the
   6611                     * temporary buffer (in the order they were added to the
   6612                     * buffer),
   6613                     */
   6614                    emitStrBuf();
   6615                    /*
   6616                     * and reconsume the current input character in the RCDATA
   6617                     * state.
   6618                     */
   6619                    break eofloop;
   6620                case CLOSE_TAG_OPEN:
   6621                    /* EOF Parse error. */
   6622                    errEofAfterLt();
   6623                    /*
   6624                     * Emit a U+003C LESS-THAN SIGN character token and a U+002F
   6625                     * SOLIDUS character token.
   6626                     */
   6627                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
   6628                    /*
   6629                     * Reconsume the EOF character in the data state.
   6630                     */
   6631                    break eofloop;
   6632                case TAG_NAME:
   6633                    /*
   6634                     * EOF Parse error.
   6635                     */
   6636                    errEofInTagName();
   6637                    /*
   6638                     * Reconsume the EOF character in the data state.
   6639                     */
   6640                    break eofloop;
   6641                case BEFORE_ATTRIBUTE_NAME:
   6642                case AFTER_ATTRIBUTE_VALUE_QUOTED:
   6643                case SELF_CLOSING_START_TAG:
   6644                    /* EOF Parse error. */
   6645                    errEofWithoutGt();
   6646                    /*
   6647                     * Reconsume the EOF character in the data state.
   6648                     */
   6649                    break eofloop;
   6650                case ATTRIBUTE_NAME:
   6651                    /*
   6652                     * EOF Parse error.
   6653                     */
   6654                    errEofInAttributeName();
   6655                    /*
   6656                     * Reconsume the EOF character in the data state.
   6657                     */
   6658                    break eofloop;
   6659                case AFTER_ATTRIBUTE_NAME:
   6660                case BEFORE_ATTRIBUTE_VALUE:
   6661                    /* EOF Parse error. */
   6662                    errEofWithoutGt();
   6663                    /*
   6664                     * Reconsume the EOF character in the data state.
   6665                     */
   6666                    break eofloop;
   6667                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
   6668                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
   6669                case ATTRIBUTE_VALUE_UNQUOTED:
   6670                    /* EOF Parse error. */
   6671                    errEofInAttributeValue();
   6672                    /*
   6673                     * Reconsume the EOF character in the data state.
   6674                     */
   6675                    break eofloop;
   6676                case BOGUS_COMMENT:
   6677                    emitComment(0, 0);
   6678                    break eofloop;
   6679                case BOGUS_COMMENT_HYPHEN:
   6680                    // [NOCPP[
   6681                    maybeAppendSpaceToBogusComment();
   6682                    // ]NOCPP]
   6683                    emitComment(0, 0);
   6684                    break eofloop;
   6685                case MARKUP_DECLARATION_OPEN:
   6686                    errBogusComment();
   6687                    emitComment(0, 0);
   6688                    break eofloop;
   6689                case MARKUP_DECLARATION_HYPHEN:
   6690                    errBogusComment();
   6691                    emitComment(0, 0);
   6692                    break eofloop;
   6693                case MARKUP_DECLARATION_OCTYPE:
   6694                    if (index < 6) {
   6695                        errBogusComment();
   6696                        emitComment(0, 0);
   6697                    } else {
   6698                        /* EOF Parse error. */
   6699                        errEofInDoctype();
   6700                        /*
   6701                         * Create a new DOCTYPE token. Set its force-quirks flag
   6702                         * to on.
   6703                         */
   6704                        doctypeName = null;
   6705                        if (systemIdentifier != null) {
   6706                            Portability.releaseString(systemIdentifier);
   6707                            systemIdentifier = null;
   6708                        }
   6709                        if (publicIdentifier != null) {
   6710                            Portability.releaseString(publicIdentifier);
   6711                            publicIdentifier = null;
   6712                        }
   6713                        forceQuirks = true;
   6714                        /*
   6715                         * Emit the token.
   6716                         */
   6717                        emitDoctypeToken(0);
   6718                        /*
   6719                         * Reconsume the EOF character in the data state.
   6720                         */
   6721                        break eofloop;
   6722                    }
   6723                    break eofloop;
   6724                case COMMENT_START:
   6725                case COMMENT:
   6726                case COMMENT_LESSTHAN:
   6727                case COMMENT_LESSTHAN_BANG:
   6728                    /*
   6729                     * EOF Parse error.
   6730                     */
   6731                    errEofInComment();
   6732                    /* Emit the comment token. */
   6733                    emitComment(0, 0);
   6734                    /*
   6735                     * Reconsume the EOF character in the data state.
   6736                     */
   6737                    break eofloop;
   6738                case COMMENT_END:
   6739                case COMMENT_LESSTHAN_BANG_DASH_DASH:
   6740                    errEofInComment();
   6741                    /* Emit the comment token. */
   6742                    emitComment(2, 0);
   6743                    /*
   6744                     * Reconsume the EOF character in the data state.
   6745                     */
   6746                    break eofloop;
   6747                case COMMENT_END_DASH:
   6748                case COMMENT_START_DASH:
   6749                case COMMENT_LESSTHAN_BANG_DASH:
   6750                    errEofInComment();
   6751                    /* Emit the comment token. */
   6752                    emitComment(1, 0);
   6753                    /*
   6754                     * Reconsume the EOF character in the data state.
   6755                     */
   6756                    break eofloop;
   6757                case COMMENT_END_BANG:
   6758                    errEofInComment();
   6759                    /* Emit the comment token. */
   6760                    emitComment(3, 0);
   6761                    /*
   6762                     * Reconsume the EOF character in the data state.
   6763                     */
   6764                    break eofloop;
   6765                case DOCTYPE:
   6766                case BEFORE_DOCTYPE_NAME:
   6767                    errEofInDoctype();
   6768                    /*
   6769                     * Create a new DOCTYPE token. Set its force-quirks flag to
   6770                     * on.
   6771                     */
   6772                    forceQuirks = true;
   6773                    /*
   6774                     * Emit the token.
   6775                     */
   6776                    emitDoctypeToken(0);
   6777                    /*
   6778                     * Reconsume the EOF character in the data state.
   6779                     */
   6780                    break eofloop;
   6781                case DOCTYPE_NAME:
   6782                    errEofInDoctype();
   6783                    strBufToDoctypeName();
   6784                    /*
   6785                     * Set the DOCTYPE token's force-quirks flag to on.
   6786                     */
   6787                    forceQuirks = true;
   6788                    /*
   6789                     * Emit that DOCTYPE token.
   6790                     */
   6791                    emitDoctypeToken(0);
   6792                    /*
   6793                     * Reconsume the EOF character in the data state.
   6794                     */
   6795                    break eofloop;
   6796                case DOCTYPE_UBLIC:
   6797                case DOCTYPE_YSTEM:
   6798                case AFTER_DOCTYPE_NAME:
   6799                case AFTER_DOCTYPE_PUBLIC_KEYWORD:
   6800                case AFTER_DOCTYPE_SYSTEM_KEYWORD:
   6801                case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
   6802                    errEofInDoctype();
   6803                    /*
   6804                     * Set the DOCTYPE token's force-quirks flag to on.
   6805                     */
   6806                    forceQuirks = true;
   6807                    /*
   6808                     * Emit that DOCTYPE token.
   6809                     */
   6810                    emitDoctypeToken(0);
   6811                    /*
   6812                     * Reconsume the EOF character in the data state.
   6813                     */
   6814                    break eofloop;
   6815                case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
   6816                case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
   6817                    /* EOF Parse error. */
   6818                    errEofInPublicId();
   6819                    /*
   6820                     * Set the DOCTYPE token's force-quirks flag to on.
   6821                     */
   6822                    forceQuirks = true;
   6823                    /*
   6824                     * Emit that DOCTYPE token.
   6825                     */
   6826                    publicIdentifier = strBufToString();
   6827                    emitDoctypeToken(0);
   6828                    /*
   6829                     * Reconsume the EOF character in the data state.
   6830                     */
   6831                    break eofloop;
   6832                case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
   6833                case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
   6834                case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
   6835                    errEofInDoctype();
   6836                    /*
   6837                     * Set the DOCTYPE token's force-quirks flag to on.
   6838                     */
   6839                    forceQuirks = true;
   6840                    /*
   6841                     * Emit that DOCTYPE token.
   6842                     */
   6843                    emitDoctypeToken(0);
   6844                    /*
   6845                     * Reconsume the EOF character in the data state.
   6846                     */
   6847                    break eofloop;
   6848                case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
   6849                case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
   6850                    /* EOF Parse error. */
   6851                    errEofInSystemId();
   6852                    /*
   6853                     * Set the DOCTYPE token's force-quirks flag to on.
   6854                     */
   6855                    forceQuirks = true;
   6856                    /*
   6857                     * Emit that DOCTYPE token.
   6858                     */
   6859                    systemIdentifier = strBufToString();
   6860                    emitDoctypeToken(0);
   6861                    /*
   6862                     * Reconsume the EOF character in the data state.
   6863                     */
   6864                    break eofloop;
   6865                case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
   6866                    errEofInDoctype();
   6867                    /*
   6868                     * Set the DOCTYPE token's force-quirks flag to on.
   6869                     */
   6870                    forceQuirks = true;
   6871                    /*
   6872                     * Emit that DOCTYPE token.
   6873                     */
   6874                    emitDoctypeToken(0);
   6875                    /*
   6876                     * Reconsume the EOF character in the data state.
   6877                     */
   6878                    break eofloop;
   6879                case BOGUS_DOCTYPE:
   6880                    /*
   6881                     * Emit that DOCTYPE token.
   6882                     */
   6883                    emitDoctypeToken(0);
   6884                    /*
   6885                     * Reconsume the EOF character in the data state.
   6886                     */
   6887                    break eofloop;
   6888                case CONSUME_CHARACTER_REFERENCE:
   6889                    /*
   6890                     * Unlike the definition is the spec, this state does not
   6891                     * return a value and never requires the caller to
   6892                     * backtrack. This state takes care of emitting characters
   6893                     * or appending to the current attribute value. It also
   6894                     * takes care of that in the case when consuming the entity
   6895                     * fails.
   6896                     */
   6897                    /*
   6898                     * This section defines how to consume an entity. This
   6899                     * definition is used when parsing entities in text and in
   6900                     * attributes.
   6901                     *
   6902                     * The behavior depends on the identity of the next
   6903                     * character (the one immediately after the U+0026 AMPERSAND
   6904                     * character):
   6905                     */
   6906 
   6907                    emitOrAppendCharRefBuf(returnState);
   6908                    state = returnState;
   6909                    continue;
   6910                case CHARACTER_REFERENCE_HILO_LOOKUP:
   6911                    emitOrAppendCharRefBuf(returnState);
   6912                    state = returnState;
   6913                    continue;
   6914                case CHARACTER_REFERENCE_TAIL:
   6915                    outer: for (;;) {
   6916                        char c = '\u0000';
   6917                        entCol++;
   6918                        /*
   6919                         * Consume the maximum number of characters possible,
   6920                         * with the consumed characters matching one of the
   6921                         * identifiers in the first column of the named
   6922                         * character references table (in a case-sensitive
   6923                         * manner).
   6924                         */
   6925                        hiloop: for (;;) {
   6926                            if (hi == -1) {
   6927                                break hiloop;
   6928                            }
   6929                            if (entCol == NamedCharacters.NAMES[hi].length()) {
   6930                                break hiloop;
   6931                            }
   6932                            if (entCol > NamedCharacters.NAMES[hi].length()) {
   6933                                break outer;
   6934                            } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
   6935                                hi--;
   6936                            } else {
   6937                                break hiloop;
   6938                            }
   6939                        }
   6940 
   6941                        loloop: for (;;) {
   6942                            if (hi < lo) {
   6943                                break outer;
   6944                            }
   6945                            if (entCol == NamedCharacters.NAMES[lo].length()) {
   6946                                candidate = lo;
   6947                                charRefBufMark = charRefBufLen;
   6948                                lo++;
   6949                            } else if (entCol > NamedCharacters.NAMES[lo].length()) {
   6950                                break outer;
   6951                            } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
   6952                                lo++;
   6953                            } else {
   6954                                break loloop;
   6955                            }
   6956                        }
   6957                        if (hi < lo) {
   6958                            break outer;
   6959                        }
   6960                        continue;
   6961                    }
   6962 
   6963                    if (candidate == -1) {
   6964                        emitOrAppendCharRefBuf(returnState);
   6965                        state = returnState;
   6966                        continue eofloop;
   6967                    } else {
   6968                        @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
   6969                        if (candidateName.length() == 0
   6970                                || candidateName.charAt(candidateName.length() - 1) != ';') {
   6971                            /*
   6972                             * If the last character matched is not a U+003B
   6973                             * SEMICOLON (;), there is a parse error.
   6974                             */
   6975                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   6976                                /*
   6977                                 * If the entity is being consumed as part of an
   6978                                 * attribute, and the last character matched is
   6979                                 * not a U+003B SEMICOLON (;),
   6980                                 */
   6981                                char ch;
   6982                                if (charRefBufMark == charRefBufLen) {
   6983                                    ch = '\u0000';
   6984                                } else {
   6985                                    ch = charRefBuf[charRefBufMark];
   6986                                }
   6987                                if ((ch >= '0' && ch <= '9')
   6988                                        || (ch >= 'A' && ch <= 'Z')
   6989                                        || (ch >= 'a' && ch <= 'z')) {
   6990                                    /*
   6991                                     * and the next character is in the range
   6992                                     * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
   6993                                     * U+0041 LATIN CAPITAL LETTER A to U+005A
   6994                                     * LATIN CAPITAL LETTER Z, or U+0061 LATIN
   6995                                     * SMALL LETTER A to U+007A LATIN SMALL
   6996                                     * LETTER Z, then, for historical reasons,
   6997                                     * all the characters that were matched
   6998                                     * after the U+0026 AMPERSAND (&) must be
   6999                                     * unconsumed, and nothing is returned.
   7000                                     */
   7001                                    appendCharRefBufToStrBuf();
   7002                                    state = returnState;
   7003                                    continue eofloop;
   7004                                }
   7005                            }
   7006                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   7007                                errUnescapedAmpersandInterpretedAsCharacterReference();
   7008                            } else {
   7009                                errNotSemicolonTerminated();
   7010                            }
   7011                        }
   7012 
   7013                        /*
   7014                         * Otherwise, return a character token for the character
   7015                         * corresponding to the entity name (as given by the
   7016                         * second column of the named character references
   7017                         * table).
   7018                         */
   7019                        @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
   7020                        if (
   7021                        // [NOCPP[
   7022                        val.length == 1
   7023                        // ]NOCPP]
   7024                        // CPPONLY: val[1] == 0
   7025                        ) {
   7026                            emitOrAppendOne(val, returnState);
   7027                        } else {
   7028                            emitOrAppendTwo(val, returnState);
   7029                        }
   7030                        // this is so complicated!
   7031                        if (charRefBufMark < charRefBufLen) {
   7032                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   7033                                appendStrBuf(charRefBuf, charRefBufMark,
   7034                                        charRefBufLen - charRefBufMark);
   7035                            } else {
   7036                                tokenHandler.characters(charRefBuf, charRefBufMark,
   7037                                        charRefBufLen - charRefBufMark);
   7038                            }
   7039                        }
   7040                        charRefBufLen = 0;
   7041                        state = returnState;
   7042                        continue eofloop;
   7043                        /*
   7044                         * If the markup contains I'm &notit; I tell you, the
   7045                         * entity is parsed as "not", as in, I'm ¬it; I tell
   7046                         * you. But if the markup was I'm &notin; I tell you,
   7047                         * the entity would be parsed as "notin;", resulting in
   7048                         * I'm ∉ I tell you.
   7049                         */
   7050                    }
   7051                case CONSUME_NCR:
   7052                case DECIMAL_NRC_LOOP:
   7053                case HEX_NCR_LOOP:
   7054                    /*
   7055                     * If no characters match the range, then don't consume any
   7056                     * characters (and unconsume the U+0023 NUMBER SIGN
   7057                     * character and, if appropriate, the X character). This is
   7058                     * a parse error; nothing is returned.
   7059                     *
   7060                     * Otherwise, if the next character is a U+003B SEMICOLON,
   7061                     * consume that too. If it isn't, there is a parse error.
   7062                     */
   7063                    if (!seenDigits) {
   7064                        errNoDigitsInNCR();
   7065                        emitOrAppendCharRefBuf(returnState);
   7066                        state = returnState;
   7067                        continue;
   7068                    } else {
   7069                        errCharRefLacksSemicolon();
   7070                    }
   7071                    // WARNING previous state sets reconsume
   7072                    handleNcrValue(returnState);
   7073                    state = returnState;
   7074                    continue;
   7075                case CDATA_RSQB:
   7076                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
   7077                    break eofloop;
   7078                case CDATA_RSQB_RSQB:
   7079                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
   7080                    break eofloop;
   7081                case DATA:
   7082                default:
   7083                    break eofloop;
   7084            }
   7085        }
   7086        // case DATA:
   7087        /*
   7088         * EOF Emit an end-of-file token.
   7089         */
   7090        tokenHandler.eof();
   7091        return;
   7092    }
   7093 
   7094    /**
   7095     * Emits a doctype token.
   7096     *
   7097     * NOTE: The method may set <code>shouldSuspend</code>, so the caller
   7098     * must have this pattern after the state's <code>transition</code> call:
   7099     * <pre>
   7100     * if (shouldSuspend) {
   7101     *     break stateloop;
   7102     * }
   7103     * continue stateloop;
   7104     * </pre>
   7105     *
   7106     * @param pos
   7107     * @throws SAXException
   7108     */
   7109    private void emitDoctypeToken(int pos) throws SAXException {
   7110        // CPPONLY: RememberGt(pos);
   7111        cstart = pos + 1;
   7112        tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
   7113                forceQuirks);
   7114        // It is OK and sufficient to release these here, since
   7115        // there's no way out of the doctype states than through paths
   7116        // that call this method.
   7117        doctypeName = null;
   7118        Portability.releaseString(publicIdentifier);
   7119        publicIdentifier = null;
   7120        Portability.releaseString(systemIdentifier);
   7121        systemIdentifier = null;
   7122        suspendIfRequestedAfterCurrentNonTextToken();
   7123    }
   7124 
   7125    /**
   7126     * If a previous call to <code>suspendAfterCurrentTokenIfNotInText()</code>
   7127     * happened in a non-text context, this method turns that deferred suspension
   7128     * request into an immediately-pending suspension request.
   7129     */
   7130    @Inline private void suspendIfRequestedAfterCurrentNonTextToken() {
   7131        if (suspendAfterCurrentNonTextToken) {
   7132            suspendAfterCurrentNonTextToken = false;
   7133            shouldSuspend = true;
   7134        }
   7135    }
   7136 
   7137    // Making this private until the full Java implementation is done.
   7138    /**
   7139     * Request suspension after the current token if the tokenizer is currently
   7140     * in a non-text state (i.e. it's known that the next token will be a
   7141     * non-text token).
   7142     *
   7143     * Must not be called when <code>tokenizeBuffer()</code> is on the call
   7144     * stack.
   7145     */
   7146    @SuppressWarnings("unused") private void suspendAfterCurrentTokenIfNotInText() {
   7147        switch (stateSave) {
   7148            case DATA:
   7149            case RCDATA:
   7150            case SCRIPT_DATA:
   7151            case RAWTEXT:
   7152            case SCRIPT_DATA_ESCAPED:
   7153            case PLAINTEXT:
   7154            case NON_DATA_END_TAG_NAME: // We haven't yet committed to the next
   7155                                        // token being a non-text token, though
   7156                                        // it could be.
   7157            case SCRIPT_DATA_LESS_THAN_SIGN:
   7158            case SCRIPT_DATA_ESCAPE_START:
   7159            case SCRIPT_DATA_ESCAPE_START_DASH:
   7160            case SCRIPT_DATA_ESCAPED_DASH:
   7161            case SCRIPT_DATA_ESCAPED_DASH_DASH:
   7162            case RAWTEXT_RCDATA_LESS_THAN_SIGN:
   7163            case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
   7164            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
   7165            case SCRIPT_DATA_DOUBLE_ESCAPED:
   7166            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
   7167            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
   7168            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
   7169            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
   7170                return;
   7171            case TAG_NAME:
   7172            case BEFORE_ATTRIBUTE_NAME:
   7173            case ATTRIBUTE_NAME:
   7174            case AFTER_ATTRIBUTE_NAME:
   7175            case BEFORE_ATTRIBUTE_VALUE:
   7176            case AFTER_ATTRIBUTE_VALUE_QUOTED:
   7177            case BOGUS_COMMENT:
   7178            case MARKUP_DECLARATION_OPEN:
   7179            case DOCTYPE:
   7180            case BEFORE_DOCTYPE_NAME:
   7181            case DOCTYPE_NAME:
   7182            case AFTER_DOCTYPE_NAME:
   7183            case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
   7184            case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
   7185            case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
   7186            case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
   7187            case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
   7188            case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
   7189            case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
   7190            case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
   7191            case BOGUS_DOCTYPE:
   7192            case COMMENT_START:
   7193            case COMMENT_START_DASH:
   7194            case COMMENT:
   7195            case COMMENT_END_DASH:
   7196            case COMMENT_END:
   7197            case COMMENT_END_BANG:
   7198            case TAG_OPEN:
   7199            case CLOSE_TAG_OPEN:
   7200            case MARKUP_DECLARATION_HYPHEN:
   7201            case MARKUP_DECLARATION_OCTYPE:
   7202            case DOCTYPE_UBLIC:
   7203            case DOCTYPE_YSTEM:
   7204            case AFTER_DOCTYPE_PUBLIC_KEYWORD:
   7205            case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
   7206            case AFTER_DOCTYPE_SYSTEM_KEYWORD:
   7207            case SELF_CLOSING_START_TAG:
   7208            case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
   7209            case ATTRIBUTE_VALUE_SINGLE_QUOTED:
   7210            case ATTRIBUTE_VALUE_UNQUOTED:
   7211            case BOGUS_COMMENT_HYPHEN:
   7212            case COMMENT_LESSTHAN:
   7213            case COMMENT_LESSTHAN_BANG:
   7214            case COMMENT_LESSTHAN_BANG_DASH:
   7215            case COMMENT_LESSTHAN_BANG_DASH_DASH:
   7216            case CDATA_START:
   7217            case CDATA_SECTION:
   7218            case CDATA_RSQB:
   7219            case CDATA_RSQB_RSQB:
   7220            case PROCESSING_INSTRUCTION:
   7221            case PROCESSING_INSTRUCTION_QUESTION_MARK:
   7222                break;
   7223            case CONSUME_CHARACTER_REFERENCE:
   7224            case CONSUME_NCR:
   7225            case CHARACTER_REFERENCE_TAIL:
   7226            case HEX_NCR_LOOP:
   7227            case DECIMAL_NRC_LOOP:
   7228            case HANDLE_NCR_VALUE:
   7229            case HANDLE_NCR_VALUE_RECONSUME:
   7230            case CHARACTER_REFERENCE_HILO_LOOKUP:
   7231                if (returnStateSave == DATA || returnStateSave == RCDATA) {
   7232                    return;
   7233                }
   7234                break;
   7235            default:
   7236                assert false : "Incomplete switch";
   7237                return;
   7238        }
   7239        suspendAfterCurrentNonTextToken = true;
   7240    }
   7241 
   // Making this private until the full Java implementation is done.
   /**
    * Reports whether a deferred suspension request made through
    * <code>suspendAfterCurrentTokenIfNotInText()</code> is still outstanding.
    * The value is simply the current state of the
    * <code>suspendAfterCurrentNonTextToken</code> flag.
    *
    * @return <code>true</code> iff <code>suspendAfterCurrentTokenIfNotInText()</code> was
    * called in a non-text position and the then-current token has not been emitted yet.
    */
   @SuppressWarnings("unused") private boolean suspensionAfterCurrentNonTextTokenPending() {
       return suspendAfterCurrentNonTextToken;
   }
   7252 
   7253    // [NOCPP[
   7254 
   /**
    * Reads one character from the buffer. In this class the method is a
    * plain array access with no additional checking; it is protected, so a
    * subclass can presumably substitute an implementation that inspects the
    * character (which would explain the declared exception -- confirm
    * against the subclasses).
    *
    * @param buf the buffer to read from
    * @param pos the index of the character to read
    * @return <code>buf[pos]</code>
    * @throws SAXException never thrown by this implementation
    */
   @Inline protected char checkChar(@NoLength char[] buf, int pos)
           throws SAXException {
       return buf[pos];
   }
   7259 
   7260    // ]NOCPP]
   7261 
   7262    public boolean internalEncodingDeclaration(String internalCharset)
   7263            throws SAXException {
   7264        if (encodingDeclarationHandler != null) {
   7265            return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
   7266        }
   7267        return false;
   7268    }
   7269 
   7270    /**
   7271     * @param val
   7272     * @throws SAXException
   7273     */
   7274    @Inline private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
   7275            throws SAXException {
   7276        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   7277            appendStrBuf(val[0]);
   7278            appendStrBuf(val[1]);
   7279        } else {
   7280            tokenHandler.characters(val, 0, 2);
   7281        }
   7282    }
   7283 
   7284    @Inline private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
   7285            throws SAXException {
   7286        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
   7287            appendStrBuf(val[0]);
   7288        } else {
   7289            tokenHandler.characters(val, 0, 1);
   7290        }
   7291    }
   7292 
   7293    public void end() throws SAXException {
   7294        if (!keepBuffer) {
   7295            strBuf = null;
   7296        }
   7297        doctypeName = null;
   7298        if (systemIdentifier != null) {
   7299            Portability.releaseString(systemIdentifier);
   7300            systemIdentifier = null;
   7301        }
   7302        if (publicIdentifier != null) {
   7303            Portability.releaseString(publicIdentifier);
   7304            publicIdentifier = null;
   7305        }
   7306        tagName = null;
   7307        nonInternedTagName.setNameForNonInterned(null
   7308                // CPPONLY: , false
   7309                );
   7310        attributeName = null;
   7311        // CPPONLY: nonInternedAttributeName.setNameForNonInterned(null);
   7312        tokenHandler.endTokenization();
   7313        if (attributes != null) {
   7314            // [NOCPP[
   7315            attributes = null;
   7316            // ]NOCPP]
   7317            // CPPONLY: attributes.clear(mappingLangToXmlLang);
   7318        }
   7319    }
   7320 
   /**
    * Requests suspension by setting <code>shouldSuspend</code>
    * unconditionally. Unlike
    * <code>suspendAfterCurrentTokenIfNotInText()</code>, no state check is
    * made and nothing is deferred.
    */
   @Inline public void requestSuspension() {
       shouldSuspend = true;
   }
   7324 
   7325    // [NOCPP[
   7326 
   /**
    * Sets the <code>confident</code> flag. The flag is reset to
    * <code>false</code> by <code>initializeWithoutStarting()</code>.
    */
   public void becomeConfident() {
       confident = true;
   }
   7330 
   /**
    * Placeholder for querying whether the next character starts a new line.
    * This implementation keeps no such information and always answers
    * <code>false</code>.
    *
    * @return always <code>false</code> in this class
    */
   public boolean isNextCharOnNewLine() {
       return false;
   }
   7339 
   /**
    * Returns the current value of the <code>lastCR</code> flag (judging by
    * the names, whether the previously seen character was a carriage
    * return).
    *
    * @return the value of <code>lastCR</code>
    */
   public boolean isPrevCR() {
       return lastCR;
   }
   7343 
   /**
    * Placeholder for the current line number. This implementation does not
    * report a line and always returns -1.
    *
    * @return always -1 in this class
    */
   public int getLine() {
       return -1;
   }
   7352 
   /**
    * Placeholder for the current column number. This implementation does not
    * report a column and always returns -1.
    *
    * @return always -1 in this class
    */
   public int getCol() {
       return -1;
   }
   7361 
   7362    // ]NOCPP]
   7363 
   /**
    * Tells whether the saved tokenizer state is <code>DATA</code>.
    *
    * @return <code>true</code> iff <code>stateSave</code> equals
    *         <code>DATA</code>
    */
   @Inline public boolean isInDataState() {
       return (stateSave == DATA);
   }
   7367 
   7368    public void resetToDataState() {
   7369        clearStrBufAfterUse();
   7370        charRefBufLen = 0;
   7371        stateSave = Tokenizer.DATA;
   7372        // line = 1; XXX line numbers
   7373        lastCR = false;
   7374        index = 0;
   7375        forceQuirks = false;
   7376        additional = '\u0000';
   7377        entCol = -1;
   7378        firstCharKey = -1;
   7379        lo = 0;
   7380        hi = 0; // will always be overwritten before use anyway
   7381        candidate = -1;
   7382        charRefBufMark = 0;
   7383        value = 0;
   7384        seenDigits = false;
   7385        suspendAfterCurrentNonTextToken = false;
   7386        endTag = false;
   7387        shouldSuspend = false;
   7388        initDoctypeFields();
   7389        containsHyphen = false;
   7390        tagName = null;
   7391        attributeName = null;
   7392        if (newAttributesEachTime) {
   7393            if (attributes != null) {
   7394                Portability.delete(attributes);
   7395                attributes = null;
   7396            }
   7397        }
   7398    }
   7399 
   /**
    * Copies the tokenizer state of <code>other</code> into this tokenizer.
    *
    * The string buffer and character reference buffer contents are copied,
    * the public and system identifiers are duplicated through
    * <code>Portability.newStringFromString()</code>, and the attribute holder
    * is cloned. The two suspension flags are not copied; they are reset to
    * <code>false</code>.
    *
    * NOTE(review): <code>strBuf</code> and <code>other.strBuf</code> are
    * dereferenced without a null check, although
    * <code>initializeWithoutStarting()</code> and <code>end()</code> can set
    * <code>strBuf</code> to <code>null</code> -- confirm that callers only
    * invoke this while both buffers exist.
    *
    * @param other the tokenizer whose state is copied
    * @throws SAXException declared by the signature; presumably raised while
    *             cloning the attributes -- confirm
    */
   public void loadState(Tokenizer other) throws SAXException {
       strBufLen = other.strBufLen;
       // Grow our buffer only when the other tokenizer's content does not
       // fit.
       if (strBufLen > strBuf.length) {
           strBuf = new char[strBufLen];
       }
       System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);

       // No resizing here: the existing charRefBuf is expected to be large
       // enough for charRefBufLen chars.
       charRefBufLen = other.charRefBufLen;
       System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);

       stateSave = other.stateSave;
       returnStateSave = other.returnStateSave;
       endTagExpectation = other.endTagExpectation;
       endTagExpectationAsArray = other.endTagExpectationAsArray;
       // line = 1; XXX line numbers
       lastCR = other.lastCR;
       index = other.index;
       forceQuirks = other.forceQuirks;
       additional = other.additional;
       entCol = other.entCol;
       firstCharKey = other.firstCharKey;
       lo = other.lo;
       hi = other.hi;
       candidate = other.candidate;
       charRefBufMark = other.charRefBufMark;
       value = other.value;
       seenDigits = other.seenDigits;
       endTag = other.endTag;
       // Suspension requests are deliberately not inherited.
       shouldSuspend = false;
       suspendAfterCurrentNonTextToken = false;
       doctypeName = other.doctypeName;

       // Release our old identifier before taking a fresh copy of the
       // other tokenizer's one.
       Portability.releaseString(systemIdentifier);
       if (other.systemIdentifier == null) {
           systemIdentifier = null;
       } else {
           systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
       }

       Portability.releaseString(publicIdentifier);
       if (other.publicIdentifier == null) {
           publicIdentifier = null;
       } else {
           publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
       }

       containsHyphen = other.containsHyphen;
       if (other.tagName == null) {
           tagName = null;
       } else if (other.tagName.isInterned()) {
           tagName = other.tagName;
       } else {
           // In the C++ case, the atoms in the other tokenizer are from a
           // different tokenizer-scoped atom table. Therefore, we have to
           // obtain the corresponding atom from our own atom table.
           nonInternedTagName.setNameForNonInterned(other.tagName.getName()
                   // CPPONLY: , other.tagName.isCustom()
                   );
           tagName = nonInternedTagName;
       }

       // [NOCPP[
       attributeName = other.attributeName;
       // ]NOCPP]
       // CPPONLY: if (other.attributeName == null) {
       // CPPONLY:     attributeName = null;
       // CPPONLY: } else if (other.attributeName.isInterned()) {
       // CPPONLY:     attributeName = other.attributeName;
       // CPPONLY: } else {
       // CPPONLY:     // In the C++ case, the atoms in the other tokenizer are from a
       // CPPONLY:     // different tokenizer-scoped atom table. Therefore, we have to
       // CPPONLY:     // obtain the correspoding atom from our own atom table.
       // CPPONLY:     nonInternedAttributeName.setNameForNonInterned(other.attributeName.getLocal(AttributeName.HTML));
       // CPPONLY:     attributeName = nonInternedAttributeName;
       // CPPONLY: }

       // Our own attribute holder is deleted first; the other tokenizer's
       // holder is cloned rather than shared.
       Portability.delete(attributes);
       if (other.attributes == null) {
           attributes = null;
       } else {
           attributes = other.attributes.cloneAttributes();
       }
   }
   7483 
   /**
    * Resets the tokenizer's per-document fields and then calls
    * <code>resetToDataState()</code>. Judging by the name, the token handler
    * is not told about the start of tokenization here; nothing in this
    * method notifies it.
    *
    * @throws SAXException declared by the signature; presumably from
    *             querying the token handler -- confirm
    */
   public void initializeWithoutStarting() throws SAXException {
       confident = false;
       // Unless the buffer is to be kept, drop it.
       if (!keepBuffer) {
           strBuf = null;
       }
       line = 1;
       // CPPONLY: attributeLine = 1;
       // [NOCPP[
       metaBoundaryPassed = false;
       wantsComments = tokenHandler.wantsComments();
       // When attributes are not re-created per token, one holder is
       // allocated up front.
       if (!newAttributesEachTime) {
           attributes = new HtmlAttributes(mappingLangToXmlLang);
       }
       // ]NOCPP]
       resetToDataState();
   }
   7500 
   // Error-reporting hooks.
   //
   // Every method from here through errBogusDoctype() has an empty body in
   // this class. They are protected and non-final, so a subclass can
   // override them to report the condition the method name describes; all
   // of them are declared to throw SAXException so that an override may
   // raise one. The call sites are elsewhere in this file.

   protected void errGarbageAfterLtSlash() throws SAXException {
   }

   protected void errLtSlashGt() throws SAXException {
   }

   protected void errWarnLtSlashInRcdata() throws SAXException {
   }

   protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
   }

   protected void errCharRefLacksSemicolon() throws SAXException {
   }

   protected void errNoDigitsInNCR() throws SAXException {
   }

   protected void errGtInSystemId() throws SAXException {
   }

   protected void errGtInPublicId() throws SAXException {
   }

   protected void errNamelessDoctype() throws SAXException {
   }

   protected void errNestedComment() throws SAXException {
   }

   protected void errPrematureEndOfComment() throws SAXException {
   }

   protected void errBogusComment() throws SAXException {
   }

   protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
   }

   protected void errSlashNotFollowedByGt() throws SAXException {
   }

   protected void errNoSpaceBetweenAttributes() throws SAXException {
   }

   protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
           throws SAXException {
   }

   protected void errAttributeValueMissing() throws SAXException {
   }

   protected void errBadCharBeforeAttributeNameOrNull(char c)
           throws SAXException {
   }

   protected void errEqualsSignBeforeAttributeName() throws SAXException {
   }

   protected void errBadCharAfterLt(char c) throws SAXException {
   }

   protected void errLtGt() throws SAXException {
   }

   protected void errProcessingInstruction() throws SAXException {
   }

   protected void errUnescapedAmpersandInterpretedAsCharacterReference()
           throws SAXException {
   }

   protected void errNotSemicolonTerminated() throws SAXException {
   }

   protected void errNoNamedCharacterMatch() throws SAXException {
   }

   protected void errQuoteBeforeAttributeName(char c) throws SAXException {
   }

   protected void errQuoteOrLtInAttributeNameOrNull(char c)
           throws SAXException {
   }

   protected void errExpectedPublicId() throws SAXException {
   }

   protected void errBogusDoctype() throws SAXException {
   }
   7591 
   // Warning and error hooks, continued.
   //
   // Every method from here through errNcrInC1Range() is a protected hook
   // that does nothing in this class, so that a subclass can override it.
   // The void ones have empty bodies; the two that return a char hand
   // their argument back unchanged.

   protected void maybeWarnPrivateUseAstral() throws SAXException {
   }

   protected void maybeWarnPrivateUse(char ch) throws SAXException {
   }

   protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
           throws SAXException {
   }

   protected void maybeErrSlashInEndTag(boolean selfClosing)
           throws SAXException {
   }

   /**
    * Hook with a return value; this implementation is the identity.
    *
    * @param ch the character in question
    * @return <code>ch</code>, unchanged, in this class (an override could
    *         presumably return a substitute character -- confirm at the
    *         call site)
    */
   protected char errNcrNonCharacter(char ch) throws SAXException {
       return ch;
   }

   protected void errAstralNonCharacter(int ch) throws SAXException {
   }

   protected void errNcrSurrogate() throws SAXException {
   }

   /**
    * Hook with a return value; this implementation is the identity. Note
    * that a no-argument overload with the same name also exists in this
    * class.
    *
    * @param ch the character in question
    * @return <code>ch</code>, unchanged, in this class
    */
   protected char errNcrControlChar(char ch) throws SAXException {
       return ch;
   }

   protected void errNcrCr() throws SAXException {
   }

   protected void errNcrInC1Range() throws SAXException {
   }
   7625 
   // Error-reporting hooks, continued.
   //
   // Every method from here through
   // errNoSpaceBetweenDoctypePublicKeywordAndQuote() has an empty body in
   // this class. They are protected and non-final, so a subclass can
   // override them to report the condition the method name describes
   // (several of the names refer to end of file being reached inside a
   // construct).

   protected void errEofInPublicId() throws SAXException {
   }

   protected void errEofInComment() throws SAXException {
   }

   protected void errEofInDoctype() throws SAXException {
   }

   protected void errEofInAttributeValue() throws SAXException {
   }

   protected void errEofInAttributeName() throws SAXException {
   }

   protected void errEofWithoutGt() throws SAXException {
   }

   protected void errEofInTagName() throws SAXException {
   }

   protected void errEofInEndTag() throws SAXException {
   }

   protected void errEofAfterLt() throws SAXException {
   }

   protected void errNcrOutOfRange() throws SAXException {
   }

   protected void errNcrUnassigned() throws SAXException {
   }

   protected void errDuplicateAttribute() throws SAXException {
   }

   protected void errEofInSystemId() throws SAXException {
   }

   protected void errExpectedSystemId() throws SAXException {
   }

   protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
   }

   // No-argument overload; a variant taking and returning a char also
   // exists in this class.
   protected void errNcrControlChar() throws SAXException {
   }

   protected void errNcrZero() throws SAXException {
   }

   protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
           throws SAXException {
   }

   protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
   }

   protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
           throws SAXException {
   }
   7687 
   // Informational hooks. Both methods have empty bodies in this class and
   // are protected, so a subclass can override them; going by the names,
   // they concern an attribute that has no value and an attribute value
   // written without quotes, respectively.

   protected void noteAttributeWithoutValue() throws SAXException {
   }

   protected void noteUnquotedAttributeValue() throws SAXException {
   }
   7693 
   /**
    * Sets the handler that <code>internalEncodingDeclaration()</code>
    * forwards to. Passing <code>null</code> leaves the tokenizer without a
    * handler, in which case <code>internalEncodingDeclaration()</code>
    * returns <code>false</code>.
    *
    * @param encodingDeclarationHandler
    *            the encodingDeclarationHandler to set
    */
   public void setEncodingDeclarationHandler(
           EncodingDeclarationHandler encodingDeclarationHandler) {
       this.encodingDeclarationHandler = encodingDeclarationHandler;
   }
   7704 
   /**
    * Tears down the objects owned by this tokenizer: the non-interned tag
    * name holder and the attribute holder are passed to
    * <code>Portability.delete()</code> and their fields set to
    * <code>null</code>. The method name and the comments inside suggest it
    * is the source of the destructor in the translated C++ code.
    */
   void destructor() {
       Portability.delete(nonInternedTagName);
       nonInternedTagName = null;
       // CPPONLY: Portability.delete(nonInternedAttributeName);
       // CPPONLY: nonInternedAttributeName = null;
       // The translator will write refcount tracing stuff here
       Portability.delete(attributes);
       attributes = null;
   }
   7714 
   7715    // [NOCPP[
   7716 
   /**
    * Hook for setting an offset to be added to the position reported to
    * <code>TransitionHandler</code>. This implementation has an empty body
    * and ignores the argument; a subclass would have to override it for the
    * offset to take effect.
    *
    * @param offset the offset
    */
   public void setTransitionBaseOffset(int offset) {

   }
   7726 
   7727    // ]NOCPP]
   7728 
   7729 }