/**
 * Reports whether a character is allowed inside an attribute name.
 * Double quote, single quote, "<" and "=" are rejected; every other value is accepted.
 * @param {string} c The character to check.
 * @return {boolean} false for the four rejected characters, true otherwise.
 */
function isLegalInAttributeName(c) {
    switch (c) {
    case '"':
    case "'":
    case "<":
    case "=":
        return false;
    default:
        return true;
    }
}
/**
 * Reports whether a character is allowed in a tag name: ASCII letters, digits, or "-".
 * @param {string} c The character to check.
 * @return {boolean} true if the pattern matches somewhere in c.
 */
function isLegalInTagName(c) {
    // "-" is accepted because it is popular in Angular custom tag names and will be
    // legal in the web components spec.
    var legalTagNameChar = /[A-Za-z0-9\-]/;
    return legalTagNameChar.exec(c) !== null;
}
/**
 * Reports whether a character is allowed in an unquoted attribute value.
 * Only "<" and "=" are rejected.
 * @param {string} c The character to check.
 * @return {boolean} false for "<" or "=", true otherwise.
 */
function isLegalInUnquotedAttributeValue(c) {
    var isForbidden = (c === "<" || c === "=");
    return !isForbidden;
}
/**
 * Copies a {line, ch} position, optionally shifting the character offset.
 * @param {?{line: number, ch: number}} pos The position to copy; any falsy value yields null.
 * @param {number=} offset Amount added to ch; treated as 0 when omitted or falsy.
 * @return {?{line: number, ch: number}} A new position object, or null if pos is falsy.
 */
function _clonePos(pos, offset) {
    if (!pos) {
        return null;
    }
    var shift = offset || 0;
    return { line: pos.line, ch: pos.ch + shift };
}
// A simple HTML tokenizer. See the description of nextToken() for usage details.
/**
 * Creates a tokenizer over the given text, positioned at offset 0 (line 0, ch 0)
 * in the TEXT state with no tokens pending.
 * @constructor
 * @param {string} text The document to tokenize.
 */
function Tokenizer(text) {
    // Each position field needs its own object, so build a fresh one per call.
    function originPos() {
        return { line: 0, ch: 0 };
    }

    // Scanner state and the buffer being scanned.
    this._state = TEXT;
    this._buffer = text;

    // Start of the section currently being accumulated (offset and line/ch).
    this._sectionStart = 0;
    this._sectionStartPos = originPos();

    // Current read position (offset and line/ch).
    this._index = 0;
    this._indexPos = originPos();

    // 1 for script, 2 for style
    this._special = 0;

    // Token produced by the current pass, plus at most one queued follow-up token.
    this._token = null;
    this._nextToken = null;
}
/**
 * Emits a marker token that carries no content, such as the end of an open tag
 * or a self-closing tag.
 * @param {string} type The token type.
 * @param {number=} index End offset for the token, forwarded to _emitToken.
 * @param {{line: number, ch: number}=} indexPos End position, forwarded to _emitToken.
 */
Tokenizer.prototype._emitSpecialToken = function (type, index, indexPos) {
    // Marker tokens only flag a boundary, so drop any open section before emitting:
    // the section start becomes -1 and its position becomes null.
    this._sectionStartPos = null;
    this._sectionStart = -1;
    this._emitToken(type, index, indexPos);
};
/**
 * Records a token for the current section, then closes the section.
 * @param {string} type The token type.
 * @param {number=} index End offset for the token, forwarded to _setToken.
 * @param {{line: number, ch: number}=} indexPos End position, forwarded to _setToken.
 */
Tokenizer.prototype._emitToken = function (type, index, indexPos) {
    // Record the token first, while the section start is still set.
    this._setToken(type, index, indexPos);

    // Then mark that no section is open.
    this._sectionStartPos = null;
    this._sectionStart = -1;
};
/**
 * Records a token for the current section only when the read index lies past the
 * section start. The section is closed either way.
 * @param {string} type The token type.
 */
Tokenizer.prototype._emitTokenIfNonempty = function (type) {
    var sectionHasLength = this._sectionStart < this._index;
    if (sectionHasLength) {
        this._setToken(type);
    }

    // Mark that no section is open, whether or not a token was recorded.
    this._sectionStartPos = null;
    this._sectionStart = -1;
};
exports.Tokenizer = Tokenizer;
});
/**
 * Builds a token spanning from the current section start to the given end and stores
 * it as the pending token, or as the queued follow-up if a token is already pending.
 * @param {string} type The token type.
 * @param {number=} index End offset of the token; defaults to the current read index.
 * @param {{line: number, ch: number}=} indexPos End position of the token; defaults to
 *      the current read position.
 */
Tokenizer.prototype._setToken = function (type, index, indexPos) {
    var end = (index === undefined) ? this._index : index;
    var endPos = (indexPos === undefined) ? this._indexPos : indexPos;
    var start = this._sectionStart;
    var hasSection = (start !== -1);

    var token = {
        type: type,
        // A section start of -1 means there is no section, hence no contents.
        contents: hasSection ? this._buffer.substring(start, end) : "",
        start: start,
        end: end,
        // Positions are copied so the token does not share objects with the tokenizer.
        startPos: _clonePos(this._sectionStartPos),
        endPos: _clonePos(endPos)
    };

    if (!this._token) {
        this._token = token;
        return;
    }

    // A token is already pending, so queue this one to be emitted next. An
    // arbitrary-length queue would be more general, but at most two tokens are ever
    // emitted in a single pass through the tokenization loop.
    if (this._nextToken) {
        console.error("HTMLTokenizer: Tried to emit more than two tokens in a single call");
    }
    this._nextToken = token;
};
// Returns the next token in the HTML document, or null if we're at the end of the document.
Tokenizer.prototype.nextToken = function () {
this._token = null;
if (this._nextToken) {
var result = this._nextToken;
this._nextToken = null;
return result;
}
while (this._index < this._buffer.length && !this._token) {
var c = this._buffer.charAt(this._index);
if (this._state === TEXT) {
if (c === "<") {
this._emitTokenIfNonempty("text");
this._state = BEFORE_TAG_NAME;
this._startSection();
}
} else if (this._state === BEFORE_TAG_NAME) {
if (c === "/") {
this._state = BEFORE_CLOSING_TAG_NAME;
} else if (c === ">" || this._special > 0) {
this._state = TEXT;
} else {
if (c === "!") {
this._state = BEFORE_DECLARATION;
this._startSection(1);
} else if (c === "?") {
this._state = IN_PROCESSING_INSTRUCTION;
this._startSection(1);
} else if (c === "s" || c === "S") {
this._state = BEFORE_SPECIAL;
this._startSection();
} else if (!isLegalInTagName(c)) {
this._emitSpecialToken("error");
break;
} else if (!isWhitespace(c)) {
this._state = IN_TAG_NAME;
this._startSection();
}
}
} else if (this._state === IN_TAG_NAME) {
if (c === "/") {
this._emitToken("opentagname");
this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
this._state = AFTER_SELFCLOSE_SLASH;
} else if (c === ">") {
this._emitToken("opentagname");
this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
this._state = TEXT;
this._startSection(1);
} else if (isWhitespace(c)) {
this._emitToken("opentagname");
this._state = BEFORE_ATTRIBUTE_NAME;
} else if (!isLegalInTagName(c)) {
this._emitSpecialToken("error");
break;
}
} else if (this._state === BEFORE_CLOSING_TAG_NAME) {
if (c === ">") {
this._state = TEXT;
} else if (this._special > 0) {
if (c === "s" || c === "S") {
this._state = BEFORE_SPECIAL_END;
} else {
this._state = TEXT;
continue;
}
} else if (!isLegalInTagName(c)) {
this._emitSpecialToken("error");
break;
} else if (!isWhitespace(c)) {
this._state = IN_CLOSING_TAG_NAME;
this._startSection();
}
} else if (this._state === IN_CLOSING_TAG_NAME) {
if (c === ">") {
this._emitToken("closetag");
this._state = TEXT;
this._startSection(1);
this._special = 0;
} else if (isWhitespace(c)) {
this._emitToken("closetag");
this._state = AFTER_CLOSING_TAG_NAME;
this._special = 0;
} else if (!isLegalInTagName(c)) {
this._emitSpecialToken("error");
break;
}
} else if (this._state === AFTER_CLOSING_TAG_NAME) {
if (c === ">") {
this._state = TEXT;
this._startSection(1);
} else if (!isWhitespace(c)) {
// There must be only whitespace in the closing tag after the name until the ">".
this._emitSpecialToken("error");
break;
}
} else if (this._state === AFTER_SELFCLOSE_SLASH) {
// Nothing (even whitespace) can come between the / and > of a self-close.
if (c === ">") {
this._state = TEXT;
this._startSection(1);
} else {
this._emitSpecialToken("error");
break;
}