Modules (188)

HTMLSimpleDOM

Description

Dependencies

Variables

openImpliesClose

A list of tags whose start causes any of a given set of immediate parent tags to close. This mostly comes from the HTML5 spec section on omitted close tags: http://www.w3.org/html/wg/drafts/html/master/syntax.html#optional-tags This doesn't handle general content model violations.

    var openImpliesClose = {
        li      : { li: true },
        dt      : { dd: true, dt: true },
        dd      : { dd: true, dt: true },
        address : { p: true },
        article : { p: true },
        aside   : { p: true },
        blockquote : { p: true },
        colgroup: { caption: true },
        details : { p: true },
        dir     : { p: true },
        div     : { p: true },
        dl      : { p: true },
        fieldset: { p: true },
        figcaption: { p: true },
        figure  : { p: true },
        footer  : { p: true },
        form    : { p: true },
        h1      : { p: true },
        h2      : { p: true },
        h3      : { p: true },
        h4      : { p: true },
        h5      : { p: true },
        h6      : { p: true },
        header  : { p: true },
        hgroup  : { p: true },
        hr      : { p: true },
        main    : { p: true },
        menu    : { p: true },
        nav     : { p: true },
        ol      : { p: true },
        p       : { p: true },
        pre     : { p: true },
        section : { p: true },
        table   : { p: true },
        ul      : { p: true },
        rb      : { rb: true, rt: true, rtc: true, rp: true },
        rp      : { rb: true, rt: true, rp: true },
        rt      : { rb: true, rt: true, rp: true },
        rtc     : { rb: true, rt: true, rtc: true, rp: true },
        optgroup: { optgroup: true, option: true },
        option  : { option: true },
        tbody   : { caption: true, colgroup: true, thead: true, tbody: true, tfoot: true },
        tfoot   : { caption: true, colgroup: true, thead: true, tbody: true },
        thead   : { caption: true, colgroup: true },
        tr      : { tr: true, th: true, td: true, caption: true },
        th      : { th: true, td: true },
        td      : { th: true, td: true },
        body    : { head: true }
    };

optionalClose

A list of elements which are automatically closed when their parent is closed: http://www.w3.org/html/wg/drafts/html/master/syntax.html#optional-tags

    var optionalClose = {
      html: true,
      body: true,
      li: true,
      dd: true,
      dt: true, // This is not actually correct, but showing a syntax error is not helpful
      p: true,
      rb: true,
      rt: true,
      rtc: true,
      rp: true,
      optgroup: true,
      option: true,
      colgroup: true,
      caption: true,
      tbody: true,
      tfoot: true,
      tr: true,
      td: true,
      th: true
    };

    // TODO: handle optional start tags

voidElements

A list of tags that are self-closing (do not contain other elements). Mostly taken from http://www.w3.org/html/wg/drafts/html/master/syntax.html#void-elements

    var voidElements = {
        area: true,
        base: true,
        basefont: true,
        br: true,
        col: true,
        command: true,
        embed: true,
        frame: true,
        hr: true,
        img: true,
        input: true,
        isindex: true,
        keygen: true,
        link: true,
        menuitem: true,
        meta: true,
        param: true,
        source: true,
        track: true,
        wbr: true
    };

Functions

Private

_addPos

    function _addPos(pos1, pos2) {
        return {line: pos1.line + pos2.line, ch: (pos2.line === 0 ? pos1.ch + pos2.ch : pos2.ch)};
    }
Private Public API

_dumpDOM

root SimpleNode
root of the tree
Returns: string
Text version of the tree.
    function _dumpDOM(root) {
        var result = "",
            indent = "";

        function walk(node) {
            if (node.tag) {
                result += indent + "TAG " + node.tagID + " " + node.tag + " " + JSON.stringify(node.attributes) + "\n";
            } else {
                result += indent + "TEXT " + (node.tagID || "- ") + node.content + "\n";
            }
            if (node.isElement()) {
                indent += "  ";
                node.children.forEach(walk);
                indent = indent.slice(2);
            }
        }
        walk(root);

        return result;
    }

    // Public API
    exports.build                       = build;
    exports.Builder                     = Builder;
    exports.SimpleNode                  = SimpleNode;

    // Private API
    exports._dumpDOM                    = _dumpDOM;
    exports._offsetPos                  = _offsetPos;
    exports._getTextNodeID              = getTextNodeID;
    exports._seed                       = seed;
});
Private Public API

_offsetPos

    function _offsetPos(pos, offset) {
        return {line: pos.line, ch: pos.ch + offset};
    }
Public API

build

Builds a SimpleDOM from the text provided. If strict mode is true, parsing will halt as soon as any error is seen and null will be returned.

text string
Text of document to parse
strict bool
True for strict parsing
Returns: SimpleNode
root of tree or null if strict failed
    function build(text, strict) {
        var builder = new Builder(text);
        return builder.build(strict);
    }
Private

getTextNodeID

textNode Object
new node for which we are generating an ID
Returns: string
ID for the node
    function getTextNodeID(textNode) {
        var childIndex = textNode.parent.children.indexOf(textNode);
        if (childIndex === 0) {
            return textNode.parent.tagID + ".0";
        }
        return textNode.parent.children[childIndex - 1].tagID + "t";
    }

Classes

Constructor

SimpleNode

A SimpleNode represents one node in a SimpleDOM tree. Each node can have any set of properties on it, though there are a couple of assumptions made. Elements will have children and attributes properties. Text nodes will have a content property. All Elements will have a tagID and text nodes can have one.

properties Object
the properties provided will be set on the new object.
    function SimpleNode(properties) {
        $.extend(this, properties);
    }

    SimpleNode.prototype = {

Methods

isElement

Is this node an element node?

Returns: bool
true if it is an element
        isElement: function () {
            return !!this.children;
        },

isText

Is this node a text node?

Returns: bool
true if it is text
        isText: function () {
            return !this.children;
        }
    };

update

Updates signatures used to optimize the number of comparisons done during diffing. This is important to call if you change:

  • children
  • child node attributes
  • text content of a text node
  • child node text
        update: function () {
            if (this.isElement()) {
                var i,
                    subtreeHashes = "",
                    childHashes = "",
                    child;
                for (i = 0; i < this.children.length; i++) {
                    child = this.children[i];
                    if (child.isElement()) {
                        childHashes += String(child.tagID);
                        subtreeHashes += String(child.tagID) + child.attributeSignature + child.subtreeSignature;
                    } else {
                        childHashes += child.textSignature;
                        subtreeHashes += child.textSignature;
                    }
                }
                this.childSignature = MurmurHash3.hashString(childHashes, childHashes.length, seed);
                this.subtreeSignature = MurmurHash3.hashString(subtreeHashes, subtreeHashes.length, seed);
            } else {
                this.textSignature = MurmurHash3.hashString(this.content, this.content.length, seed);
            }
        },

updateAttributeSignature

Updates the signature of this node's attributes. Call this after making attribute changes.

        updateAttributeSignature: function () {
            var attributeString = JSON.stringify(this.attributes);
            this.attributeSignature = MurmurHash3.hashString(attributeString, attributeString.length, seed);
        },
Constructor

Builder

A Builder creates a SimpleDOM tree of SimpleNode objects representing the "important" contents of an HTML document. It does not include things like comments. The nodes include information about their position in the text provided.

text string
The text to parse
startOffset int
starting offset in the text
startOffsetPos {line: int, ch: int}
line/ch position in the text
    function Builder(text, startOffset, startOffsetPos) {
        this.stack = [];
        this.text = text;
        this.t = new Tokenizer(text);
        this.currentTag = null;
        this.startOffset = startOffset || 0;
        this.startOffsetPos = startOffsetPos || {line: 0, ch: 0};
    }

    Builder.prototype._logError = function (token) {
        var error       = { token: token },
            startPos    = token ? (token.startPos || token.endPos) : this.startOffsetPos,
            endPos      = token ? token.endPos : this.startOffsetPos;

        error.startPos = _addPos(this.startOffsetPos, startPos);
        error.endPos = _addPos(this.startOffsetPos, endPos);

        if (!this.errors) {
            this.errors = [];
        }

        this.errors.push(error);
    };

Properties

getID

Returns the best tag ID for the new tag object given. The default implementation just calls getNewID and returns a unique ID.

    Builder.prototype.getID = Builder.prototype.getNewID;

Methods

Public API

build

Builds the SimpleDOM.

strict nullable bool
if errors are detected, halt and return null
markCache nullable Object
a cache that can be used in ID generation (is passed to `getID`)
Returns: SimpleNode
root of tree or null if parsing failed
    Builder.prototype.build = function (strict, markCache) {
        var self = this;
        var token, lastClosedTag, lastTextNode;
        var stack = this.stack;
        var attributeName = null;
        var nodeMap = {};

        markCache = markCache || {};

        // Start timers for building full and partial DOMs.
        // Appropriate timer is used, and the other is discarded.
        var timerBuildFull = "HTMLInstr. Build DOM Full";
        var timerBuildPart = "HTMLInstr. Build DOM Partial";
        var timers; // timer handles
        timers = PerfUtils.markStart([timerBuildFull, timerBuildPart]);
        timerBuildFull = timers[0];
        timerBuildPart = timers[1];

        function closeTag(endIndex, endPos) {
            lastClosedTag = stack[stack.length - 1];
            stack.pop();
            lastClosedTag.update();

            lastClosedTag.end = self.startOffset + endIndex;
            lastClosedTag.endPos = _addPos(self.startOffsetPos, endPos);
        }

        while ((token = this.t.nextToken()) !== null) {
            // lastTextNode is used to glue text nodes together
            // If the last node we saw was text but this one is not, then we're done gluing.
            // If this node is a comment, we might still encounter more text.
            if (token.type !== "text" && token.type !== "comment" && lastTextNode) {
                lastTextNode = null;
            }

            if (token.type === "error") {
                PerfUtils.finalizeMeasurement(timerBuildFull);  // discard
                PerfUtils.addMeasurement(timerBuildPart);       // use
                this._logError(token);
                return null;
            } else if (token.type === "opentagname") {
                var newTagName = token.contents.toLowerCase(),
                    newTag;

                if (openImpliesClose.hasOwnProperty(newTagName)) {
                    var closable = openImpliesClose[newTagName];
                    while (stack.length > 0 && closable.hasOwnProperty(stack[stack.length - 1].tag)) {
                        // Close the previous tag at the start of this tag.
                        // Adjust backwards for the < before the tag name.
                        closeTag(token.start - 1, _offsetPos(token.startPos, -1));
                    }
                }

                newTag = new SimpleNode({
                    tag: token.contents.toLowerCase(),
                    children: [],
                    attributes: {},
                    parent: (stack.length ? stack[stack.length - 1] : null),
                    start: this.startOffset + token.start - 1,
                    startPos: _addPos(this.startOffsetPos, _offsetPos(token.startPos, -1)) // ok because we know the previous char was a "<"
                });
                newTag.tagID = this.getID(newTag, markCache);

                // During undo in particular, it's possible that tag IDs may be reused and
                // the marks in the document may be misleading. If a tag ID has been reused,
                // we apply a new tag ID to ensure that our edits come out correctly.
                if (nodeMap[newTag.tagID]) {
                    newTag.tagID = this.getNewID();
                }

                nodeMap[newTag.tagID] = newTag;
                if (newTag.parent) {
                    newTag.parent.children.push(newTag);
                }
                this.currentTag = newTag;

                if (voidElements.hasOwnProperty(newTag.tag)) {
                    // This is a self-closing element.
                    newTag.update();
                } else {
                    stack.push(newTag);
                }
            } else if (token.type === "opentagend" || token.type === "selfclosingtag") {
                // TODO: disallow <p/>?
                if (this.currentTag) {
                    if (token.type === "selfclosingtag" && stack.length && stack[stack.length - 1] === this.currentTag) {
                        // This must have been a self-closing tag that we didn't identify as a void element
                        // (e.g. an SVG tag). Pop it off the stack as if we had encountered its close tag.
                        closeTag(token.end, token.endPos);
                    } else {
                        // We're ending an open tag. Record the end of the open tag as the end of the
                        // range. (If we later find a close tag for this tag, the end will get overwritten
                        // with the end of the close tag. In the case of a self-closing tag, we should never
                        // encounter that.)
                        // Note that we don't need to update the signature here because the signature only
                        // relies on the tag name and ID, and isn't affected by the tag's attributes, so
                        // the signature we calculated when creating the tag is still the same. If we later
                        // find a close tag for this tag, we'll update the signature to account for its
                        // children at that point (in the next "else" case).
                        this.currentTag.end = this.startOffset + token.end;
                        this.currentTag.endPos = _addPos(this.startOffsetPos, token.endPos);
                        lastClosedTag = this.currentTag;
                        this.currentTag.updateAttributeSignature();
                        this.currentTag = null;
                    }
                }
            } else if (token.type === "closetag") {
                // If this is a self-closing element, ignore the close tag.
                var closeTagName = token.contents.toLowerCase();
                if (!voidElements.hasOwnProperty(closeTagName)) {
                    // Find the topmost item on the stack that matches. If we can't find one, assume
                    // this is just a dangling closing tag and ignore it.
                    var i;
                    for (i = stack.length - 1; i >= 0; i--) {
                        if (stack[i].tag === closeTagName) {
                            break;
                        }
                    }
                    if (i >= 0) {
                        do {
                            // For all tags we're implicitly closing (before we hit the matching tag), we want the
                            // implied end to be the beginning of the close tag (which is two characters, "</", before
                            // the start of the tagname). For the actual tag we're explicitly closing, we want the
                            // implied end to be the end of the close tag (which is one character, ">", after the end of
                            // the tagname).
                            if (stack.length === i + 1) {
                                closeTag(token.end + 1, _offsetPos(token.endPos, 1));
                            } else {
                                if (strict && !optionalClose.hasOwnProperty(stack[stack.length - 1].tag)) {
                                    // If we're in strict mode, treat unbalanced tags as invalid.
                                    PerfUtils.finalizeMeasurement(timerBuildFull);
                                    PerfUtils.addMeasurement(timerBuildPart);
                                    this._logError(token);
                                    return null;
                                }
                                closeTag(token.start - 2, _offsetPos(token.startPos, -2));
                            }
                        } while (stack.length > i);
                    } else {
                        // If we're in strict mode, treat unmatched close tags as invalid. Otherwise
                        // we just silently ignore them.
                        if (strict) {
                            PerfUtils.finalizeMeasurement(timerBuildFull);
                            PerfUtils.addMeasurement(timerBuildPart);
                            this._logError(token);
                            return null;
                        }
                    }
                }
            } else if (token.type === "attribname") {
                attributeName = token.contents.toLowerCase();
                // Set the value to the empty string in case this is an empty attribute. If it's not,
                // it will get overwritten by the attribvalue later.
                this.currentTag.attributes[attributeName] = "";
            } else if (token.type === "attribvalue" && attributeName !== null) {
                this.currentTag.attributes[attributeName] = token.contents;
                attributeName = null;
            } else if (token.type === "text") {
                if (stack.length) {
                    var parent = stack[stack.length - 1];
                    var newNode;

                    // Check to see if we're continuing a previous text.
                    if (lastTextNode) {
                        newNode = lastTextNode;
                        newNode.content += token.contents;
                    } else {
                        newNode = new SimpleNode({
                            parent: stack[stack.length - 1],
                            content: token.contents
                        });
                        parent.children.push(newNode);
                        newNode.tagID = getTextNodeID(newNode);
                        nodeMap[newNode.tagID] = newNode;
                        lastTextNode = newNode;
                    }

                    newNode.update();
                }
            }
        }

        // If we have any tags hanging open, fail the parse if we're in strict mode,
        // otherwise close them at the end of the document.
        while (stack.length) {
            if (strict && !optionalClose.hasOwnProperty(stack[stack.length - 1].tag)) {
                PerfUtils.finalizeMeasurement(timerBuildFull);
                PerfUtils.addMeasurement(timerBuildPart);
                this._logError(token);
                return null;
            }
            closeTag(this.text.length, this.t._indexPos);
        }

        var dom = lastClosedTag;
        if (!dom) {
            // This can happen if the document has no nontrivial content, or if the user tries to
            // have something at the root other than the HTML tag. In all such cases, we treat the
            // document as invalid.
            this._logError(token);
            return null;
        }

        dom.nodeMap = nodeMap;
        PerfUtils.addMeasurement(timerBuildFull);       // use
        PerfUtils.finalizeMeasurement(timerBuildPart);  // discard

        return dom;
    };

getNewID

Returns a new tag ID.

Returns: int
unique tag ID
    Builder.prototype.getNewID = function () {
        return tagID++;
    };