Modules (181)

HTMLTokenizer

Description

Dependencies

This module has no dependencies

Functions

Private

isLegalInAttributeName

c string
the character to test
Returns: boolean
true if c is legal in an HTML attribute name
    /**
     * Tests whether a character is accepted by this tokenizer inside an HTML
     * attribute name.
     * @param {string} c the character to test
     * @return {boolean} false if c is a double quote, a single quote, "<" or "=";
     *     true for anything else
     */
    function isLegalInAttributeName(c) {
        switch (c) {
        case '"':
        case "'":
        case "<":
        case "=":
            return false;
        default:
            return true;
        }
    }
Private

isLegalInTagName

c string
the character to test
Returns: boolean
true if c is legal in an HTML tag name
    /**
     * Tests whether a single character is legal in an HTML tag name.
     * @param {string} c the character to test
     * @return {boolean} true only if c is exactly one ASCII letter, digit or "-"
     */
    function isLegalInTagName(c) {
        // We allow "-" in tag names since they're popular in Angular custom tag names
        // and will be legal in the web components spec.
        // The pattern is anchored at both ends so that only a one-character string can
        // match; an unanchored pattern would also accept any longer string that merely
        // contains a legal character (e.g. "<a").
        return (/^[A-Za-z0-9\-]$/).test(c);
    }
Private

isLegalInUnquotedAttributeValue

c string
the character to test
Returns: boolean
true if c is legal in an unquoted attribute value
    /**
     * Tests whether a character is accepted by this tokenizer inside an unquoted
     * attribute value.
     * @param {string} c the character to test
     * @return {boolean} false if c is "<" or "="; true for anything else
     */
    function isLegalInUnquotedAttributeValue(c) {
        var isForbidden = (c === "<" || c === "=");
        return !isForbidden;
    }

    /**
     * Copies a {line, ch} position, optionally shifting its ch by an offset.
     * @param {?{line: number, ch: number}} pos the position to copy; may be falsy
     * @param {number=} offset amount added to ch; a missing or falsy value adds 0
     * @return {?{line: number, ch: number}} a new position object, or null if pos is falsy
     */
    function _clonePos(pos, offset) {
        if (!pos) {
            return null;
        }
        var shift = offset || 0;
        return {
            line: pos.line,
            ch: pos.ch + shift
        };
    }
Private

isWhitespace

c string
the character to test
Returns: boolean
true if c is whitespace
    /**
     * Tests whether a character is HTML whitespace.
     * @param {string} c the character to test
     * @return {boolean} true if c is a space, tab, line feed, form feed or carriage return
     */
    function isWhitespace(c) {
        // Form feed ("\f") is included because the HTML Standard lists it among the
        // ASCII whitespace characters alongside space, tab, LF and CR.
        return c === " " || c === "\t" || c === "\n" || c === "\f" || c === "\r";
    }

Classes

Constructor

Tokenizer

A simple HTML tokenizer. See the description of nextToken() for usage details.

text string
The HTML document to tokenize.
    /**
     * A simple HTML tokenizer. See the description of nextToken() for usage details.
     * @constructor
     * @param {string} text The HTML document to tokenize.
     */
    function Tokenizer(text) {
        // Scanning starts in the TEXT state over the supplied document.
        this._state = TEXT;
        this._buffer = text;

        // Start of the section being accumulated, as a buffer index and as a
        // {line, ch} position.
        this._sectionStart = 0;
        this._sectionStartPos = { line: 0, ch: 0 };

        // Current scan location, as a buffer index and as a {line, ch} position.
        this._index = 0;
        this._indexPos = { line: 0, ch: 0 };

        // 0 by default; 1 for script, 2 for style.
        this._special = 0;

        // Token produced by the current pass, plus at most one queued follow-up token.
        this._token = null;
        this._nextToken = null;
    }

Methods

Private

_emitSpecialToken

type string
The token's type (see documentation for `nextToken()`)
index number
If specified, the index to use as the end of the token; uses this._index if not specified
    /**
     * Emits a boundary-marker token that carries no contents.
     * @param {string} type The token's type (see documentation for `nextToken()`)
     * @param {number=} index Passed through to _emitToken as the token's end index
     * @param {Object=} indexPos Passed through to _emitToken as the token's end position
     */
    Tokenizer.prototype._emitSpecialToken = function (type, index, indexPos) {
        // These tokens only mark boundaries we care about (end of an open tag or a
        // self-closing tag) and have no meaningful content, so the section start is
        // cleared (-1 / null) before the token is emitted.
        this._sectionStartPos = null;
        this._sectionStart = -1;
        this._emitToken(type, index, indexPos);
    };
Private

_emitToken

type string
The token's type (see documentation for `nextToken()`)
index number
If specified, the index to use as the end of the token; uses this._index if not specified
    /**
     * Records a token through _setToken and then clears the section start.
     * @param {string} type The token's type (see documentation for `nextToken()`)
     * @param {number=} index Passed through to _setToken as the token's end index
     * @param {Object=} indexPos Passed through to _setToken as the token's end position
     */
    Tokenizer.prototype._emitToken = function (type, index, indexPos) {
        this._setToken(type, index, indexPos);

        // The section has been consumed by the token: mark that none is active.
        this._sectionStartPos = null;
        this._sectionStart = -1;
    };
Private

_emitTokenIfNonempty

type string
The token's type (see documentation for `nextToken()`)
    /**
     * Records a token through _setToken only when the current index lies past the
     * section start; the section start is cleared in either case.
     * @param {string} type The token's type (see documentation for `nextToken()`)
     */
    Tokenizer.prototype._emitTokenIfNonempty = function (type) {
        var hasContent = this._sectionStart < this._index;
        if (hasContent) {
            this._setToken(type);
        }

        // Cleared whether or not a token was recorded.
        this._sectionStartPos = null;
        this._sectionStart = -1;
    };

    // Make the Tokenizer constructor available on the exports object.
    exports.Tokenizer = Tokenizer;
});
Private

_setToken

type string
The token's type (see documentation for `nextToken()`)
index number
If specified, the index to use as the end of the token; uses this._index if not specified
    /**
     * Builds a token from the current section and stores it in this._token, or in
     * this._nextToken when this._token is already occupied.
     * @param {string} type The token's type (see documentation for `nextToken()`)
     * @param {number=} index If specified, the index to use as the end of the token;
     *     uses this._index if not specified
     * @param {Object=} indexPos If specified, the position to use as the end of the
     *     token; uses this._indexPos if not specified
     */
    Tokenizer.prototype._setToken = function (type, index, indexPos) {
        var endIndex = (index === undefined) ? this._index : index;
        var endPosition = (indexPos === undefined) ? this._indexPos : indexPos;
        var startIndex = this._sectionStart;
        var hasSection = startIndex !== -1;

        var token = {
            type: type,
            // A section start of -1 means the token has no contents.
            contents: hasSection ? this._buffer.substring(startIndex, endIndex) : "",
            start: startIndex,
            end: endIndex,
            // Positions are copied so the token does not share objects with the tokenizer.
            startPos: _clonePos(this._sectionStartPos),
            endPos: _clonePos(endPosition)
        };

        if (!this._token) {
            this._token = token;
            return;
        }

        // A token is already pending, so queue this one to be emitted next. In theory it
        // would be more general to have an arbitrary-length queue, but currently at most
        // two tokens are emitted in a single pass through the tokenization loop.
        if (this._nextToken) {
            console.error("HTMLTokenizer: Tried to emit more than two tokens in a single call");
        }
        this._nextToken = token;
    };

nextToken

Returns the next token in the HTML document, or null if we're at the end of the document.

Returns: ?{type: string,contents: string,start: number,end: number}
token The next token, with the following fields: type: The type of token, one of: "error" - invalid syntax was found, tokenization aborted. Calling nextToken() again will produce undefined results. "text" - contents contains the text "opentagname" - an open tag was started; contents contains the tag name "attribname" - an attribute was encountered; contents contains the attribute name "attribvalue" - the value for the previous attribname was encountered; contents contains the (unquoted) value (Note that attributes like checked and disabled might not have values.) "opentagend" - the end of an open tag was encountered; contents is unspecified "selfclosingtag" - a "/>" was encountered indicating that a void element was self-closed; contents is unspecified (Note that this is optional in HTML; void elements like `<img>` will end with "opentagend", not "selfclosingtag") "closetag" - a close tag was encountered; contents contains the tag name "comment" - a comment was encountered; contents contains the body of the comment "cdata" - a CDATA block was encountered; contents contains the text inside the block contents: the contents of the token, as specified above. Note that "opentagend" and "selfclosingtag" really specify positions, not tokens, and so have no contents. start: the start index of the token contents within the text, or -1 for "opentagend" and "selfclosingtag" end: the end index of the token contents within the text, or the position of the boundary for "opentagend" and "selfclosingtag"
    Tokenizer.prototype.nextToken = function () {
        this._token = null;

        if (this._nextToken) {
            var result = this._nextToken;
            this._nextToken = null;
            return result;
        }

        while (this._index < this._buffer.length && !this._token) {
            var c = this._buffer.charAt(this._index);
            if (this._state === TEXT) {
                if (c === "<") {
                    this._emitTokenIfNonempty("text");
                    this._state = BEFORE_TAG_NAME;
                    this._startSection();
                }
            } else if (this._state === BEFORE_TAG_NAME) {
                if (c === "/") {
                    this._state = BEFORE_CLOSING_TAG_NAME;
                } else if (c === ">" || this._special > 0) {
                    this._state = TEXT;
                } else {
                    if (c === "!") {
                        this._state = BEFORE_DECLARATION;
                        this._startSection(1);
                    } else if (c === "?") {
                        this._state = IN_PROCESSING_INSTRUCTION;
                        this._startSection(1);
                    } else if (c === "s" || c === "S") {
                        this._state = BEFORE_SPECIAL;
                        this._startSection();
                    } else if (!isLegalInTagName(c)) {
                        this._emitSpecialToken("error");
                        break;
                    } else if (!isWhitespace(c)) {
                        this._state = IN_TAG_NAME;
                        this._startSection();
                    }
                }
            } else if (this._state === IN_TAG_NAME) {
                if (c === "/") {
                    this._emitToken("opentagname");
                    this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
                    this._state = AFTER_SELFCLOSE_SLASH;
                } else if (c === ">") {
                    this._emitToken("opentagname");
                    this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                    this._state = TEXT;
                    this._startSection(1);
                } else if (isWhitespace(c)) {
                    this._emitToken("opentagname");
                    this._state = BEFORE_ATTRIBUTE_NAME;
                } else if (!isLegalInTagName(c)) {
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === BEFORE_CLOSING_TAG_NAME) {
                if (c === ">") {
                    this._state = TEXT;
                } else if (this._special > 0) {
                    if (c === "s" || c === "S") {
                        this._state = BEFORE_SPECIAL_END;
                    } else {
                        this._state = TEXT;
                        continue;
                    }
                } else if (!isLegalInTagName(c)) {
                    this._emitSpecialToken("error");
                    break;
                } else if (!isWhitespace(c)) {
                    this._state = IN_CLOSING_TAG_NAME;
                    this._startSection();
                }
            } else if (this._state === IN_CLOSING_TAG_NAME) {
                if (c === ">") {
                    this._emitToken("closetag");
                    this._state = TEXT;
                    this._startSection(1);
                    this._special = 0;
                } else if (isWhitespace(c)) {
                    this._emitToken("closetag");
                    this._state = AFTER_CLOSING_TAG_NAME;
                    this._special = 0;
                } else if (!isLegalInTagName(c)) {
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === AFTER_CLOSING_TAG_NAME) {
                if (c === ">") {
                    this._state = TEXT;
                    this._startSection(1);
                } else if (!isWhitespace(c)) {
                    // There must be only whitespace in the closing tag after the name until the ">".
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === AFTER_SELFCLOSE_SLASH) {
                // Nothing (even whitespace) can come between the / and > of a self-close.
                if (c === ">") {
                    this._state = TEXT;
                    this._startSection(1);
                } else {
                    this._emitSpecialToken("error");
                    break;
                }